From 5a0b8cb46624cc17fda676d6ae44fb72504f0ad9 Mon Sep 17 00:00:00 2001
From: Gwendal Grignou
Date: Tue, 13 Mar 2018 14:23:28 -0700
Subject: iio: cros_ec: Move cros_ec_sensors_core.h in /include

Similar to other common iio frameworks, move cros_ec_sensors_core.h from
drivers/iio/common/cros_ec_sensors/ to include/linux/iio/common.

Signed-off-by: Gwendal Grignou
Signed-off-by: Jonathan Cameron
---
 include/linux/iio/common/cros_ec_sensors_core.h | 180 ++++++++++++++++++++++
 1 file changed, 180 insertions(+)
 create mode 100644 include/linux/iio/common/cros_ec_sensors_core.h

(limited to 'include')

diff --git a/include/linux/iio/common/cros_ec_sensors_core.h b/include/linux/iio/common/cros_ec_sensors_core.h
new file mode 100644
index 000000000000..ce16445411ac
--- /dev/null
+++ b/include/linux/iio/common/cros_ec_sensors_core.h
@@ -0,0 +1,180 @@
+/*
+ * ChromeOS EC sensor hub
+ *
+ * Copyright (C) 2016 Google, Inc
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __CROS_EC_SENSORS_CORE_H
+#define __CROS_EC_SENSORS_CORE_H
+
+#include <linux/iio/iio.h>
+#include <linux/irqreturn.h>
+#include <linux/mfd/cros_ec.h>
+
+enum {
+	CROS_EC_SENSOR_X,
+	CROS_EC_SENSOR_Y,
+	CROS_EC_SENSOR_Z,
+	CROS_EC_SENSOR_MAX_AXIS,
+};
+
+/* EC returns sensor values using signed 16 bit registers */
+#define CROS_EC_SENSOR_BITS 16
+
+/*
+ * 4 16 bit channels are allowed.
+ * Good enough for current sensors, they use up to 3 16 bit vectors.
+ */
+#define CROS_EC_SAMPLE_SIZE (sizeof(s64) * 2)
+
+/* Minimum sampling period to use when device is suspending */
+#define CROS_EC_MIN_SUSPEND_SAMPLING_FREQUENCY 1000  /* 1 second */
+
+/**
+ * struct cros_ec_sensors_core_state - state data for EC sensors IIO driver
+ * @ec: cros EC device structure
+ * @cmd_lock: lock used to prevent simultaneous access to the commands.
+ * @msg: cros EC command structure
+ * @param: motion sensor parameters structure
+ * @resp: motion sensor response structure
+ * @type: type of motion sensor
+ * @loc: location where the motion sensor is placed
+ * @calib: calibration parameters. Note that trigger captured data will
+ *	always provide the calibrated data
+ * @samples: static array to hold data from a single capture. For each
+ *	channel we need 2 bytes, except for the timestamp. The timestamp
+ *	is always last and is always 8-byte aligned.
+ * @read_ec_sensors_data: function used for accessing sensors values
+ * @curr_sampl_freq: current sampling frequency
+ */
+struct cros_ec_sensors_core_state {
+	struct cros_ec_device *ec;
+	struct mutex cmd_lock;
+
+	struct cros_ec_command *msg;
+	struct ec_params_motion_sense param;
+	struct ec_response_motion_sense *resp;
+
+	enum motionsensor_type type;
+	enum motionsensor_location loc;
+
+	s16 calib[CROS_EC_SENSOR_MAX_AXIS];
+
+	u8 samples[CROS_EC_SAMPLE_SIZE];
+
+	int (*read_ec_sensors_data)(struct iio_dev *indio_dev,
+				    unsigned long scan_mask, s16 *data);
+
+	int curr_sampl_freq;
+};
+
+/**
+ * cros_ec_sensors_read_lpc() - retrieve data from EC shared memory
+ * @indio_dev: pointer to IIO device
+ * @scan_mask: bitmap of the sensor indices to scan
+ * @data: location to store data
+ *
+ * This is the safe function for reading the EC data. It guarantees that the
+ * data sampled was not modified by the EC while being read.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int cros_ec_sensors_read_lpc(struct iio_dev *indio_dev, unsigned long scan_mask,
+			     s16 *data);
+
+/**
+ * cros_ec_sensors_read_cmd() - retrieve data using the EC command protocol
+ * @indio_dev: pointer to IIO device
+ * @scan_mask: bitmap of the sensor indices to scan
+ * @data: location to store data
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int cros_ec_sensors_read_cmd(struct iio_dev *indio_dev, unsigned long scan_mask,
+			     s16 *data);
+
+struct platform_device;
+/**
+ * cros_ec_sensors_core_init() - basic initialization of the core structure
+ * @pdev: platform device created for the sensors
+ * @indio_dev: iio device structure of the device
+ * @physical_device: true if the device refers to a physical device
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int cros_ec_sensors_core_init(struct platform_device *pdev,
+			      struct iio_dev *indio_dev, bool physical_device);
+
+/**
+ * cros_ec_sensors_capture() - the trigger handler function
+ * @irq: the interrupt number.
+ * @p: a pointer to the poll function.
+ *
+ * On a trigger event occurring, if the pollfunc is attached then this
+ * handler is called as a threaded interrupt (and hence may sleep). It
+ * is responsible for grabbing data from the device and pushing it into
+ * the associated buffer.
+ *
+ * Return: IRQ_HANDLED
+ */
+irqreturn_t cros_ec_sensors_capture(int irq, void *p);
+
+/**
+ * cros_ec_motion_send_host_cmd() - send motion sense host command
+ * @st: pointer to state information for device
+ * @opt_length: optional length to reduce the response size, useful on the data
+ *	path. Otherwise, the maximal allowed response size is used
+ *
+ * When called, the sub-command is assumed to be set in param->cmd.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int cros_ec_motion_send_host_cmd(struct cros_ec_sensors_core_state *st,
+				 u16 opt_length);
+
+/**
+ * cros_ec_sensors_core_read() - function to request a value from the sensor
+ * @st: pointer to state information for device
+ * @chan: channel specification structure table
+ * @val: will contain one element making up the returned value
+ * @val2: will contain another element making up the returned value
+ * @mask: specifies which values are requested
+ *
+ * Return: the type of value returned by the device
+ */
+int cros_ec_sensors_core_read(struct cros_ec_sensors_core_state *st,
+			      struct iio_chan_spec const *chan,
+			      int *val, int *val2, long mask);
+
+/**
+ * cros_ec_sensors_core_write() - function to write a value to the sensor
+ * @st: pointer to state information for device
+ * @chan: channel specification structure table
+ * @val: first part of value to write
+ * @val2: second part of value to write
+ * @mask: specifies which values to write
+ *
+ * Return: the type of value returned by the device
+ */
+int cros_ec_sensors_core_write(struct cros_ec_sensors_core_state *st,
+			       struct iio_chan_spec const *chan,
+			       int val, int val2, long mask);
+
+extern const struct dev_pm_ops cros_ec_sensors_pm_ops;
+
+/* List of extended channel specification for all sensors */
+extern const struct iio_chan_spec_ext_info cros_ec_sensors_ext_info[];
+
+#endif /* __CROS_EC_SENSORS_CORE_H */
-- cgit v1.2.3

From 192af06a287bbfcae0de8fa456dca15767ecd056 Mon Sep 17 00:00:00 2001
From: Alexandru Ardelean
Date: Mon, 12 Mar 2018 17:48:15 +0200
Subject: iio: adc: ad7780: remove IIO_CHAN_INFO_SAMP_FREQ support

The `ad7780` driver does not implement setting/getting the sampling
frequency. For the ad7780/ad7781 devices, the control is done via an
external pin, and the ad7170/ad7171 devices have a fixed sampling rate
(so, no control).

For these devices, and similar others that may be added later on, an
AD_SD_CHANNEL_NO_SAMP_FREQ() macro has been added, which doesn't set the
IIO_CHAN_INFO_SAMP_FREQ flag.

Signed-off-by: Alexandru Ardelean
Signed-off-by: Jonathan Cameron
---
 drivers/staging/iio/adc/ad7780.c       |  2 +-
 include/linux/iio/adc/ad_sigma_delta.h | 24 +++++++++++++++++-------
 2 files changed, 18 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/staging/iio/adc/ad7780.c b/drivers/staging/iio/adc/ad7780.c
index a7797af579b9..16d72072c076 100644
--- a/drivers/staging/iio/adc/ad7780.c
+++ b/drivers/staging/iio/adc/ad7780.c
@@ -128,7 +128,7 @@ static const struct ad_sigma_delta_info ad7780_sigma_delta_info = {
 };
 
 #define AD7780_CHANNEL(bits, wordsize) \
-	AD_SD_CHANNEL(1, 0, 0, bits, 32, wordsize - bits)
+	AD_SD_CHANNEL_NO_SAMP_FREQ(1, 0, 0, bits, 32, wordsize - bits)
 
 static const struct ad7780_chip_info ad7780_chip_info_tbl[] = {
 	[ID_AD7170] = {
diff --git a/include/linux/iio/adc/ad_sigma_delta.h b/include/linux/iio/adc/ad_sigma_delta.h
index 1fc7abd28b0b..730ead1a46df 100644
--- a/include/linux/iio/adc/ad_sigma_delta.h
+++ b/include/linux/iio/adc/ad_sigma_delta.h
@@ -127,7 +127,7 @@ void ad_sd_cleanup_buffer_and_trigger(struct iio_dev *indio_dev);
 int ad_sd_validate_trigger(struct iio_dev *indio_dev, struct iio_trigger *trig);
 
 #define __AD_SD_CHANNEL(_si, _channel1, _channel2, _address, _bits, \
-	_storagebits, _shift, _extend_name, _type) \
+	_storagebits, _shift, _extend_name, _type, _mask_all) \
 	{ \
 		.type = (_type), \
 		.differential = (_channel2 == -1 ?
0 : 1), \ @@ -139,7 +139,7 @@ int ad_sd_validate_trigger(struct iio_dev *indio_dev, struct iio_trigger *trig); .info_mask_separate = BIT(IIO_CHAN_INFO_RAW) | \ BIT(IIO_CHAN_INFO_OFFSET), \ .info_mask_shared_by_type = BIT(IIO_CHAN_INFO_SCALE), \ - .info_mask_shared_by_all = BIT(IIO_CHAN_INFO_SAMP_FREQ), \ + .info_mask_shared_by_all = _mask_all, \ .scan_index = (_si), \ .scan_type = { \ .sign = 'u', \ @@ -153,25 +153,35 @@ int ad_sd_validate_trigger(struct iio_dev *indio_dev, struct iio_trigger *trig); #define AD_SD_DIFF_CHANNEL(_si, _channel1, _channel2, _address, _bits, \ _storagebits, _shift) \ __AD_SD_CHANNEL(_si, _channel1, _channel2, _address, _bits, \ - _storagebits, _shift, NULL, IIO_VOLTAGE) + _storagebits, _shift, NULL, IIO_VOLTAGE, \ + BIT(IIO_CHAN_INFO_SAMP_FREQ)) #define AD_SD_SHORTED_CHANNEL(_si, _channel, _address, _bits, \ _storagebits, _shift) \ __AD_SD_CHANNEL(_si, _channel, _channel, _address, _bits, \ - _storagebits, _shift, "shorted", IIO_VOLTAGE) + _storagebits, _shift, "shorted", IIO_VOLTAGE, \ + BIT(IIO_CHAN_INFO_SAMP_FREQ)) #define AD_SD_CHANNEL(_si, _channel, _address, _bits, \ _storagebits, _shift) \ __AD_SD_CHANNEL(_si, _channel, -1, _address, _bits, \ - _storagebits, _shift, NULL, IIO_VOLTAGE) + _storagebits, _shift, NULL, IIO_VOLTAGE, \ + BIT(IIO_CHAN_INFO_SAMP_FREQ)) + +#define AD_SD_CHANNEL_NO_SAMP_FREQ(_si, _channel, _address, _bits, \ + _storagebits, _shift) \ + __AD_SD_CHANNEL(_si, _channel, -1, _address, _bits, \ + _storagebits, _shift, NULL, IIO_VOLTAGE, 0) #define AD_SD_TEMP_CHANNEL(_si, _address, _bits, _storagebits, _shift) \ __AD_SD_CHANNEL(_si, 0, -1, _address, _bits, \ - _storagebits, _shift, NULL, IIO_TEMP) + _storagebits, _shift, NULL, IIO_TEMP, \ + BIT(IIO_CHAN_INFO_SAMP_FREQ)) #define AD_SD_SUPPLY_CHANNEL(_si, _channel, _address, _bits, _storagebits, \ _shift) \ __AD_SD_CHANNEL(_si, _channel, -1, _address, _bits, \ - _storagebits, _shift, "supply", IIO_VOLTAGE) + _storagebits, _shift, "supply", IIO_VOLTAGE, \ + BIT(IIO_CHAN_INFO_SAMP_FREQ)) #endif -- cgit v1.2.3 From 06d42212e69bfa953aec5887c75a1781f27c2a2e Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Sat, 17 Mar 2018 15:39:41 +0530 Subject: dt-bindings: clock: Add Actions S900 clock bindings Add Actions Semi S900 clock bindings. Signed-off-by: Manivannan Sadhasivam Acked-by: Rob Herring Signed-off-by: Stephen Boyd --- .../devicetree/bindings/clock/actions,s900-cmu.txt | 47 ++++++++ include/dt-bindings/clock/actions,s900-cmu.h | 129 +++++++++++++++++++++ 2 files changed, 176 insertions(+) create mode 100644 Documentation/devicetree/bindings/clock/actions,s900-cmu.txt create mode 100644 include/dt-bindings/clock/actions,s900-cmu.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/clock/actions,s900-cmu.txt b/Documentation/devicetree/bindings/clock/actions,s900-cmu.txt new file mode 100644 index 000000000000..93e4fb827cd6 --- /dev/null +++ b/Documentation/devicetree/bindings/clock/actions,s900-cmu.txt @@ -0,0 +1,47 @@ +* Actions S900 Clock Management Unit (CMU) + +The Actions S900 clock management unit generates and supplies clock to various +controllers within the SoC. The clock binding described here is applicable to +S900 SoC. + +Required Properties: + +- compatible: should be "actions,s900-cmu" +- reg: physical base address of the controller and length of memory mapped + region. +- clocks: Reference to the parent clocks ("hosc", "losc") +- #clock-cells: should be 1. 
+ +Each clock is assigned an identifier, and client nodes can use this identifier +to specify the clock which they consume. + +All available clocks are defined as preprocessor macros in +dt-bindings/clock/actions,s900-cmu.h header and can be used in device +tree sources. + +External clocks: + +The hosc clock used as input for the plls is generated outside the SoC. It is +expected that it is defined using standard clock bindings as "hosc". + +Actions S900 CMU also requires one more clock: + - "losc" - internal low frequency oscillator + +Example: Clock Management Unit node: + + cmu: clock-controller@e0160000 { + compatible = "actions,s900-cmu"; + reg = <0x0 0xe0160000 0x0 0x1000>; + clocks = <&hosc>, <&losc>; + #clock-cells = <1>; + }; + +Example: UART controller node that consumes clock generated by the clock +management unit: + + uart: serial@e012a000 { + compatible = "actions,s900-uart", "actions,owl-uart"; + reg = <0x0 0xe012a000 0x0 0x2000>; + interrupts = ; + clocks = <&cmu CLK_UART5>; + }; diff --git a/include/dt-bindings/clock/actions,s900-cmu.h b/include/dt-bindings/clock/actions,s900-cmu.h new file mode 100644 index 000000000000..7c1251565f43 --- /dev/null +++ b/include/dt-bindings/clock/actions,s900-cmu.h @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: GPL-2.0+ +// +// Device Tree binding constants for Actions Semi S900 Clock Management Unit +// +// Copyright (c) 2014 Actions Semi Inc. +// Copyright (c) 2018 Linaro Ltd. + +#ifndef __DT_BINDINGS_CLOCK_S900_CMU_H +#define __DT_BINDINGS_CLOCK_S900_CMU_H + +#define CLK_NONE 0 + +/* fixed rate clocks */ +#define CLK_LOSC 1 +#define CLK_HOSC 2 + +/* pll clocks */ +#define CLK_CORE_PLL 3 +#define CLK_DEV_PLL 4 +#define CLK_DDR_PLL 5 +#define CLK_NAND_PLL 6 +#define CLK_DISPLAY_PLL 7 +#define CLK_DSI_PLL 8 +#define CLK_ASSIST_PLL 9 +#define CLK_AUDIO_PLL 10 + +/* system clock */ +#define CLK_CPU 15 +#define CLK_DEV 16 +#define CLK_NOC 17 +#define CLK_NOC_MUX 18 +#define CLK_NOC_DIV 19 +#define CLK_AHB 20 +#define CLK_APB 21 +#define CLK_DMAC 22 + +/* peripheral device clock */ +#define CLK_GPIO 23 + +#define CLK_BISP 24 +#define CLK_CSI0 25 +#define CLK_CSI1 26 + +#define CLK_DE0 27 +#define CLK_DE1 28 +#define CLK_DE2 29 +#define CLK_DE3 30 +#define CLK_DSI 32 + +#define CLK_GPU 33 +#define CLK_GPU_CORE 34 +#define CLK_GPU_MEM 35 +#define CLK_GPU_SYS 36 + +#define CLK_HDE 37 +#define CLK_I2C0 38 +#define CLK_I2C1 39 +#define CLK_I2C2 40 +#define CLK_I2C3 41 +#define CLK_I2C4 42 +#define CLK_I2C5 43 +#define CLK_I2SRX 44 +#define CLK_I2STX 45 +#define CLK_IMX 46 +#define CLK_LCD 47 +#define CLK_NAND0 48 +#define CLK_NAND1 49 +#define CLK_PWM0 50 +#define CLK_PWM1 51 +#define CLK_PWM2 52 +#define CLK_PWM3 53 +#define CLK_PWM4 54 +#define CLK_PWM5 55 +#define CLK_SD0 56 +#define CLK_SD1 57 +#define CLK_SD2 58 +#define CLK_SD3 59 +#define CLK_SENSOR 60 +#define CLK_SPEED_SENSOR 61 +#define CLK_SPI0 62 +#define CLK_SPI1 63 +#define CLK_SPI2 64 +#define CLK_SPI3 65 +#define CLK_THERMAL_SENSOR 66 +#define CLK_UART0 67 +#define CLK_UART1 68 +#define CLK_UART2 69 +#define CLK_UART3 70 +#define CLK_UART4 71 +#define CLK_UART5 72 +#define CLK_UART6 73 +#define CLK_VCE 74 +#define CLK_VDE 75 + +#define CLK_USB3_480MPLL0 76 +#define CLK_USB3_480MPHY0 77 +#define CLK_USB3_5GPHY 78 +#define CLK_USB3_CCE 79 +#define CLK_USB3_MAC 80 + +#define CLK_TIMER 83 + +#define CLK_HDMI_AUDIO 84 + +#define CLK_24M 85 + +#define CLK_EDP 86 + +#define CLK_24M_EDP 87 +#define CLK_EDP_PLL 88 +#define CLK_EDP_LINK 89 + +#define CLK_USB2H0_PLLEN 90 +#define 
CLK_USB2H0_PHY 91 +#define CLK_USB2H0_CCE 92 +#define CLK_USB2H1_PLLEN 93 +#define CLK_USB2H1_PHY 94 +#define CLK_USB2H1_CCE 95 + +#define CLK_DDR0 96 +#define CLK_DDR1 97 +#define CLK_DMM 98 + +#define CLK_ETH_MAC 99 +#define CLK_RMII_REF 100 + +#define CLK_NR_CLKS (CLK_RMII_REF + 1) + +#endif /* __DT_BINDINGS_CLOCK_S900_CMU_H */ -- cgit v1.2.3 From 3b7008b226f3de811d4ac34238e9cf670f7c9fe7 Mon Sep 17 00:00:00 2001 From: Szymon Lukasz Date: Thu, 9 Nov 2017 21:23:35 +0100 Subject: fuse: return -ECONNABORTED on /dev/fuse read after abort Currently the userspace has no way of knowing whether the fuse connection ended because of umount or abort via sysfs. It makes it hard for filesystems to free the mountpoint after abort without worrying about removing some new mount. The patch fixes it by returning different errors when userspace reads from /dev/fuse (-ENODEV for umount and -ECONNABORTED for abort). Add a new capability flag FUSE_ABORT_ERROR. If set and the connection is gone because of sysfs abort, reading from the device will return -ECONNABORTED. Signed-off-by: Szymon Lukasz Signed-off-by: Miklos Szeredi --- fs/fuse/control.c | 2 +- fs/fuse/cuse.c | 4 ++-- fs/fuse/dev.c | 12 +++++++----- fs/fuse/fuse_i.h | 8 +++++++- fs/fuse/inode.c | 9 ++++++--- include/uapi/linux/fuse.h | 7 ++++++- 6 files changed, 29 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/fs/fuse/control.c b/fs/fuse/control.c index b9ea99c5b5b3..78fb7a07c5ca 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -35,7 +35,7 @@ static ssize_t fuse_conn_abort_write(struct file *file, const char __user *buf, { struct fuse_conn *fc = fuse_ctl_file_conn_get(file); if (fc) { - fuse_abort_conn(fc); + fuse_abort_conn(fc, true); fuse_conn_put(fc); } return count; diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index e9e97803442a..31d33b69957f 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -406,7 +406,7 @@ err_unlock: err_region: unregister_chrdev_region(devt, 1); err: - fuse_abort_conn(fc); + fuse_abort_conn(fc, false); goto out; } @@ -581,7 +581,7 @@ static ssize_t cuse_class_abort_store(struct device *dev, { struct cuse_conn *cc = dev_get_drvdata(dev); - fuse_abort_conn(&cc->fc); + fuse_abort_conn(&cc->fc, false); return count; } static DEVICE_ATTR(abort, 0200, NULL, cuse_class_abort_store); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 5d06384c2cae..78b6293bd04d 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1234,9 +1234,10 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, if (err) goto err_unlock; - err = -ENODEV; - if (!fiq->connected) + if (!fiq->connected) { + err = (fc->aborted && fc->abort_err) ? -ECONNABORTED : -ENODEV; goto err_unlock; + } if (!list_empty(&fiq->interrupts)) { req = list_entry(fiq->interrupts.next, struct fuse_req, @@ -1287,7 +1288,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, spin_lock(&fpq->lock); clear_bit(FR_LOCKED, &req->flags); if (!fpq->connected) { - err = -ENODEV; + err = (fc->aborted && fc->abort_err) ? -ECONNABORTED : -ENODEV; goto out_end; } if (err) { @@ -2076,7 +2077,7 @@ static void end_polls(struct fuse_conn *fc) * is OK, the request will in that case be removed from the list before we touch * it. 
*/ -void fuse_abort_conn(struct fuse_conn *fc) +void fuse_abort_conn(struct fuse_conn *fc, bool is_abort) { struct fuse_iqueue *fiq = &fc->iq; @@ -2089,6 +2090,7 @@ void fuse_abort_conn(struct fuse_conn *fc) fc->connected = 0; fc->blocked = 0; + fc->aborted = is_abort; fuse_set_initialized(fc); list_for_each_entry(fud, &fc->devices, entry) { struct fuse_pqueue *fpq = &fud->pq; @@ -2151,7 +2153,7 @@ int fuse_dev_release(struct inode *inode, struct file *file) /* Are we the last open device? */ if (atomic_dec_and_test(&fc->dev_count)) { WARN_ON(fc->iq.fasync != NULL); - fuse_abort_conn(fc); + fuse_abort_conn(fc, false); } fuse_dev_free(fud); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index c4c093bbf456..7d2e7deea64b 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -515,6 +515,9 @@ struct fuse_conn { abort and device release */ unsigned connected; + /** Connection aborted via sysfs */ + bool aborted; + /** Connection failed (version mismatch). Cannot race with setting other bitfields since it is only set once in INIT reply, before any other request, and never cleared */ @@ -526,6 +529,9 @@ struct fuse_conn { /** Do readpages asynchronously? Only set in INIT */ unsigned async_read:1; + /** Return an unique read error after abort. Only set in INIT */ + unsigned abort_err:1; + /** Do not send separate SETATTR request before open(O_TRUNC) */ unsigned atomic_o_trunc:1; @@ -851,7 +857,7 @@ void fuse_request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req); /* Abort all requests */ -void fuse_abort_conn(struct fuse_conn *fc); +void fuse_abort_conn(struct fuse_conn *fc, bool is_abort); /** * Invalidate inode attributes diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 624f18bbfd2b..3c9b675d99da 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -371,7 +371,7 @@ void fuse_unlock_inode(struct inode *inode) static void fuse_umount_begin(struct super_block *sb) { - fuse_abort_conn(get_fuse_conn_super(sb)); + fuse_abort_conn(get_fuse_conn_super(sb), false); } static void fuse_send_destroy(struct fuse_conn *fc) @@ -393,7 +393,7 @@ static void fuse_put_super(struct super_block *sb) fuse_send_destroy(fc); - fuse_abort_conn(fc); + fuse_abort_conn(fc, false); mutex_lock(&fuse_mutex); list_del(&fc->entry); fuse_ctl_remove_conn(fc); @@ -918,6 +918,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->posix_acl = 1; fc->sb->s_xattr = fuse_acl_xattr_handlers; } + if (arg->flags & FUSE_ABORT_ERROR) + fc->abort_err = 1; } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -948,7 +950,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) FUSE_FLOCK_LOCKS | FUSE_HAS_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO | FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT | - FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL; + FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | + FUSE_ABORT_ERROR; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 4b5001c57f46..92fa24c24c92 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -113,6 +113,9 @@ * 7.26 * - add FUSE_HANDLE_KILLPRIV * - add FUSE_POSIX_ACL + * + * 7.27 + * - add FUSE_ABORT_ERROR */ #ifndef _LINUX_FUSE_H @@ -148,7 +151,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 26 +#define FUSE_KERNEL_MINOR_VERSION 
27 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -245,6 +248,7 @@ struct fuse_file_lock { * FUSE_PARALLEL_DIROPS: allow parallel lookups and readdir * FUSE_HANDLE_KILLPRIV: fs handles killing suid/sgid/cap on write/chown/trunc * FUSE_POSIX_ACL: filesystem supports posix acls + * FUSE_ABORT_ERROR: reading the device after abort returns ECONNABORTED */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -267,6 +271,7 @@ struct fuse_file_lock { #define FUSE_PARALLEL_DIROPS (1 << 18) #define FUSE_HANDLE_KILLPRIV (1 << 19) #define FUSE_POSIX_ACL (1 << 20) +#define FUSE_ABORT_ERROR (1 << 21) /** * CUSE INIT request/reply flags -- cgit v1.2.3 From 3ae7fb202d86b7847f237daa474f3946bdc3b0c6 Mon Sep 17 00:00:00 2001 From: Haneen Mohammed Date: Tue, 20 Mar 2018 09:37:49 -0400 Subject: drm: Remove drm_property_{un/reference}_blob aliases This patch remove the compatibility aliases drm_property_{reference/unreference}_blob of drm_property_blob_{get/put} since all callers have been converted to the prefered _{get/put}. Remove the helpers from the semantic patch drm-get-put-cocci. Signed-off-by: Haneen Mohammed Signed-off-by: Sean Paul Link: https://patchwork.freedesktop.org/patch/msgid/20180320133749.GA11695@haneen-VirtualBox --- include/drm/drm_property.h | 26 -------------------------- scripts/coccinelle/api/drm-get-put.cocci | 10 ---------- 2 files changed, 36 deletions(-) (limited to 'include') diff --git a/include/drm/drm_property.h b/include/drm/drm_property.h index d1423c7f3c73..ab8167baade5 100644 --- a/include/drm/drm_property.h +++ b/include/drm/drm_property.h @@ -280,32 +280,6 @@ bool drm_property_replace_blob(struct drm_property_blob **blob, struct drm_property_blob *drm_property_blob_get(struct drm_property_blob *blob); void drm_property_blob_put(struct drm_property_blob *blob); -/** - * drm_property_reference_blob - acquire a blob property reference - * @blob: DRM blob property - * - * This is a compatibility alias for drm_property_blob_get() and should not be - * used by new code. - */ -static inline struct drm_property_blob * -drm_property_reference_blob(struct drm_property_blob *blob) -{ - return drm_property_blob_get(blob); -} - -/** - * drm_property_unreference_blob - release a blob property reference - * @blob: DRM blob property - * - * This is a compatibility alias for drm_property_blob_put() and should not be - * used by new code. 
- */
-static inline void
-drm_property_unreference_blob(struct drm_property_blob *blob)
-{
-	drm_property_blob_put(blob);
-}
-
 /**
  * drm_property_find - find property object
  * @dev: DRM device
diff --git a/scripts/coccinelle/api/drm-get-put.cocci b/scripts/coccinelle/api/drm-get-put.cocci
index ceb71ea7f61c..3a09c97ad87d 100644
--- a/scripts/coccinelle/api/drm-get-put.cocci
+++ b/scripts/coccinelle/api/drm-get-put.cocci
@@ -40,12 +40,6 @@ expression object;
 - drm_gem_object_unreference_unlocked(object)
 + drm_gem_object_put_unlocked(object)
 |
-- drm_property_reference_blob(object)
-+ drm_property_blob_get(object)
-|
-- drm_property_unreference_blob(object)
-+ drm_property_blob_put(object)
-|
 - drm_dev_unref(object)
 + drm_dev_put(object)
 )
@@ -72,10 +66,6 @@
 __drm_gem_object_unreference(object)
 |
 drm_gem_object_unreference_unlocked(object)
 |
-drm_property_unreference_blob@p(object)
-|
-drm_property_reference_blob@p(object)
-|
 drm_dev_unref@p(object)
 )
-- cgit v1.2.3

From 56859d310c0eacc4a77b0cb1c9087c161b463e5d Mon Sep 17 00:00:00 2001
From: Tali Perry
Date: Tue, 20 Mar 2018 15:40:48 +0200
Subject: dt-binding: clk: npcm750: Add binding for Nuvoton NPCM7XX Clock

* Nuvoton NPCM7XX Clock Controller

Nuvoton Poleg BMC NPCM7XX contains an integrated clock controller, which
generates and supplies clocks to all modules within the BMC.

Signed-off-by: Tali Perry
Reviewed-by: Rob Herring
Signed-off-by: Stephen Boyd
---
 .../bindings/clock/nuvoton,npcm750-clk.txt        | 100 +++++++++++++++++++++
 include/dt-bindings/clock/nuvoton,npcm7xx-clock.h |  44 +++++++++
 2 files changed, 144 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/clock/nuvoton,npcm750-clk.txt
 create mode 100644 include/dt-bindings/clock/nuvoton,npcm7xx-clock.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/nuvoton,npcm750-clk.txt b/Documentation/devicetree/bindings/clock/nuvoton,npcm750-clk.txt
new file mode 100644
index 000000000000..f82064546d11
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/nuvoton,npcm750-clk.txt
@@ -0,0 +1,100 @@
+* Nuvoton NPCM7XX Clock Controller
+
+Nuvoton Poleg BMC NPCM7XX contains an integrated clock controller, which
+generates and supplies clocks to all modules within the BMC.
+
+External clocks:
+
+There are six fixed clocks that are generated outside the BMC. All clocks are
+of a known fixed value that cannot be changed. clk_refclk, clk_mcbypck and
+clk_sysbypck are inputs to the clock controller.
+clk_rg1refck, clk_rg2refck and clk_xin are external clocks supplying the
+network. They are defined in the device tree, but not used by the clock
+module. The network devices use them directly.
+An example can be found below.
+
+All available clocks are defined as preprocessor macros in:
+dt-bindings/clock/nuvoton,npcm7xx-clock.h
+and can be used in device tree sources.
+
+Required Properties of clock controller:
+
+ - compatible: "nuvoton,npcm750-clk" : for clock controller of Nuvoton
+   Poleg BMC NPCM750
+
+ - reg: physical base address of the clock controller and length of
+   memory mapped region.
+
+ - #clock-cells: should be 1.
+ +Example: Clock controller node: + + clk: clock-controller@f0801000 { + compatible = "nuvoton,npcm750-clk"; + #clock-cells = <1>; + reg = <0xf0801000 0x1000>; + clock-names = "refclk", "sysbypck", "mcbypck"; + clocks = <&clk_refclk>, <&clk_sysbypck>, <&clk_mcbypck>; + }; + +Example: Required external clocks for network: + + /* external reference clock */ + clk_refclk: clk-refclk { + compatible = "fixed-clock"; + #clock-cells = <0>; + clock-frequency = <25000000>; + clock-output-names = "refclk"; + }; + + /* external reference clock for cpu. float in normal operation */ + clk_sysbypck: clk-sysbypck { + compatible = "fixed-clock"; + #clock-cells = <0>; + clock-frequency = <800000000>; + clock-output-names = "sysbypck"; + }; + + /* external reference clock for MC. float in normal operation */ + clk_mcbypck: clk-mcbypck { + compatible = "fixed-clock"; + #clock-cells = <0>; + clock-frequency = <800000000>; + clock-output-names = "mcbypck"; + }; + + /* external clock signal rg1refck, supplied by the phy */ + clk_rg1refck: clk-rg1refck { + compatible = "fixed-clock"; + #clock-cells = <0>; + clock-frequency = <125000000>; + clock-output-names = "clk_rg1refck"; + }; + + /* external clock signal rg2refck, supplied by the phy */ + clk_rg2refck: clk-rg2refck { + compatible = "fixed-clock"; + #clock-cells = <0>; + clock-frequency = <125000000>; + clock-output-names = "clk_rg2refck"; + }; + + clk_xin: clk-xin { + compatible = "fixed-clock"; + #clock-cells = <0>; + clock-frequency = <50000000>; + clock-output-names = "clk_xin"; + }; + + +Example: GMAC controller node that consumes two clocks: a generated clk by the +clock controller and a fixed clock from DT (clk_rg1refck). + + ethernet0: ethernet@f0802000 { + compatible = "snps,dwmac"; + reg = <0xf0802000 0x2000>; + interrupts = <0 14 4>; + interrupt-names = "macirq"; + clocks = <&clk_rg1refck>, <&clk NPCM7XX_CLK_AHB>; + clock-names = "stmmaceth", "clk_gmac"; + }; diff --git a/include/dt-bindings/clock/nuvoton,npcm7xx-clock.h b/include/dt-bindings/clock/nuvoton,npcm7xx-clock.h new file mode 100644 index 000000000000..f21522605b94 --- /dev/null +++ b/include/dt-bindings/clock/nuvoton,npcm7xx-clock.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Nuvoton NPCM7xx Clock Generator binding + * clock binding number for all clocks supportted by nuvoton,npcm7xx-clk + * + * Copyright (C) 2018 Nuvoton Technologies tali.perry@nuvoton.com + * + */ + +#ifndef __DT_BINDINGS_CLOCK_NPCM7XX_H +#define __DT_BINDINGS_CLOCK_NPCM7XX_H + + +#define NPCM7XX_CLK_CPU 0 +#define NPCM7XX_CLK_GFX_PIXEL 1 +#define NPCM7XX_CLK_MC 2 +#define NPCM7XX_CLK_ADC 3 +#define NPCM7XX_CLK_AHB 4 +#define NPCM7XX_CLK_TIMER 5 +#define NPCM7XX_CLK_UART 6 +#define NPCM7XX_CLK_MMC 7 +#define NPCM7XX_CLK_SPI3 8 +#define NPCM7XX_CLK_PCI 9 +#define NPCM7XX_CLK_AXI 10 +#define NPCM7XX_CLK_APB4 11 +#define NPCM7XX_CLK_APB3 12 +#define NPCM7XX_CLK_APB2 13 +#define NPCM7XX_CLK_APB1 14 +#define NPCM7XX_CLK_APB5 15 +#define NPCM7XX_CLK_CLKOUT 16 +#define NPCM7XX_CLK_GFX 17 +#define NPCM7XX_CLK_SU 18 +#define NPCM7XX_CLK_SU48 19 +#define NPCM7XX_CLK_SDHC 20 +#define NPCM7XX_CLK_SPI0 21 +#define NPCM7XX_CLK_SPIX 22 + +#define NPCM7XX_CLK_REFCLK 23 +#define NPCM7XX_CLK_SYSBYPCK 24 +#define NPCM7XX_CLK_MCBYPCK 25 + +#define NPCM7XX_NUM_CLOCKS (NPCM7XX_CLK_MCBYPCK+1) + +#endif -- cgit v1.2.3 From a7d2a87e99deb5481b5dd723408c42f460de25a3 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Thu, 22 Mar 2018 11:51:28 +0100 Subject: drm/tinydrm: Use gem_free_object_unlocked MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit tinydrm doesn't use dev->struct_mutex and therefore has no need to use gem_free_object. Signed-off-by: Daniel Vetter Cc: "Noralf Trønnes" Acked-by: Noralf Trønnes Link: https://patchwork.freedesktop.org/patch/msgid/20180322105133.11211-2-daniel.vetter@ffwll.ch --- drivers/gpu/drm/tinydrm/core/tinydrm-core.c | 2 +- include/drm/tinydrm/tinydrm.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/tinydrm/core/tinydrm-core.c b/drivers/gpu/drm/tinydrm/core/tinydrm-core.c index 4c6616278c48..24a33bf862fa 100644 --- a/drivers/gpu/drm/tinydrm/core/tinydrm-core.c +++ b/drivers/gpu/drm/tinydrm/core/tinydrm-core.c @@ -91,7 +91,7 @@ EXPORT_SYMBOL(tinydrm_gem_cma_prime_import_sg_table); * GEM object state and frees the memory used to store the object itself using * drm_gem_cma_free_object(). It also handles PRIME buffers which has the kernel * virtual address set by tinydrm_gem_cma_prime_import_sg_table(). Drivers - * can use this as their &drm_driver->gem_free_object callback. + * can use this as their &drm_driver->gem_free_object_unlocked callback. */ void tinydrm_gem_cma_free_object(struct drm_gem_object *gem_obj) { diff --git a/include/drm/tinydrm/tinydrm.h b/include/drm/tinydrm/tinydrm.h index 07a9a11fe19d..77a93ec577fd 100644 --- a/include/drm/tinydrm/tinydrm.h +++ b/include/drm/tinydrm/tinydrm.h @@ -41,7 +41,7 @@ pipe_to_tinydrm(struct drm_simple_display_pipe *pipe) * the &drm_driver structure. */ #define TINYDRM_GEM_DRIVER_OPS \ - .gem_free_object = tinydrm_gem_cma_free_object, \ + .gem_free_object_unlocked = tinydrm_gem_cma_free_object, \ .gem_print_info = drm_gem_cma_print_info, \ .gem_vm_ops = &drm_gem_cma_vm_ops, \ .prime_handle_to_fd = drm_gem_prime_handle_to_fd, \ -- cgit v1.2.3 From d1a9d710d12485710d424daaa2430fe93bc767f8 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Tue, 27 Mar 2018 23:47:20 +0300 Subject: drm: prefer inline over __inline__ Remove last users of __inline__. Reviewed-by: Chris Wilson Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20180327204722.31246-1-jani.nikula@intel.com --- include/drm/drmP.h | 5 ++--- include/drm/drm_legacy.h | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/drm/drmP.h b/include/drm/drmP.h index c6666cd09347..4bbef061c9c0 100644 --- a/include/drm/drmP.h +++ b/include/drm/drmP.h @@ -123,8 +123,7 @@ static inline bool drm_drv_uses_atomic_modeset(struct drm_device *dev) #define DRM_SWITCH_POWER_CHANGING 2 #define DRM_SWITCH_POWER_DYNAMIC_OFF 3 -static __inline__ int drm_core_check_feature(struct drm_device *dev, - int feature) +static inline int drm_core_check_feature(struct drm_device *dev, int feature) { return ((dev->driver->driver_features & feature) ? 
1 : 0); } @@ -143,7 +142,7 @@ static __inline__ int drm_core_check_feature(struct drm_device *dev, /*@}*/ /* returns true if currently okay to sleep */ -static __inline__ bool drm_can_sleep(void) +static inline bool drm_can_sleep(void) { if (in_atomic() || in_dbg_master() || irqs_disabled()) return false; diff --git a/include/drm/drm_legacy.h b/include/drm/drm_legacy.h index cf0e7d89bcdf..8fad66f88e4f 100644 --- a/include/drm/drm_legacy.h +++ b/include/drm/drm_legacy.h @@ -194,8 +194,8 @@ void drm_legacy_ioremap(struct drm_local_map *map, struct drm_device *dev); void drm_legacy_ioremap_wc(struct drm_local_map *map, struct drm_device *dev); void drm_legacy_ioremapfree(struct drm_local_map *map, struct drm_device *dev); -static __inline__ struct drm_local_map *drm_legacy_findmap(struct drm_device *dev, - unsigned int token) +static inline struct drm_local_map *drm_legacy_findmap(struct drm_device *dev, + unsigned int token) { struct drm_map_list *_entry; list_for_each_entry(_entry, &dev->maplist, head) -- cgit v1.2.3 From 885a31cb6c752d5403adc6389894c27560fc6e6c Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Tue, 27 Mar 2018 23:47:21 +0300 Subject: drm: remove old documentation comment cruft from drmP.h Throw out the leftovers. Reviewed-by: Chris Wilson Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20180327204722.31246-2-jani.nikula@intel.com --- include/drm/drmP.h | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'include') diff --git a/include/drm/drmP.h b/include/drm/drmP.h index 4bbef061c9c0..b5d52a3d7d19 100644 --- a/include/drm/drmP.h +++ b/include/drm/drmP.h @@ -95,14 +95,6 @@ struct dma_buf_attachment; struct pci_dev; struct pci_controller; -/***********************************************************************/ -/** \name DRM template customization defaults */ -/*@{*/ - -/***********************************************************************/ -/** \name Internal types and structures */ -/*@{*/ - #define DRM_IF_VERSION(maj, min) (maj << 16 | min) /** @@ -128,19 +120,6 @@ static inline int drm_core_check_feature(struct drm_device *dev, int feature) return ((dev->driver->driver_features & feature) ? 1 : 0); } -/******************************************************************/ -/** \name Internal function definitions */ -/*@{*/ - - /* Driver support (drm_drv.h) */ - -/* - * These are exported to drivers so that they can implement fencing using - * DMA quiscent + idle. DMA quiescent usually requires the hardware lock. - */ - -/*@}*/ - /* returns true if currently okay to sleep */ static inline bool drm_can_sleep(void) { -- cgit v1.2.3 From f4392860b4fe55d7d7cadaa64743c9b2466e4fd8 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Tue, 27 Mar 2018 23:47:22 +0300 Subject: drm: make drm_core_check_feature() bool that it is Bool is the more appropriate return type here, use it. 
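
For illustration, a typical caller now reads as a natural boolean test
(hypothetical snippet, not part of this patch; DRIVER_MODESET is just one
example feature flag):

	if (!drm_core_check_feature(dev, DRIVER_MODESET))
		return -EINVAL;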
Reviewed-by: Chris Wilson
Signed-off-by: Jani Nikula
Link: https://patchwork.freedesktop.org/patch/msgid/20180327204722.31246-3-jani.nikula@intel.com
---
 include/drm/drmP.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/drm/drmP.h b/include/drm/drmP.h
index b5d52a3d7d19..f5099c12c6a6 100644
--- a/include/drm/drmP.h
+++ b/include/drm/drmP.h
@@ -115,9 +115,9 @@ static inline bool drm_drv_uses_atomic_modeset(struct drm_device *dev)
 #define DRM_SWITCH_POWER_CHANGING 2
 #define DRM_SWITCH_POWER_DYNAMIC_OFF 3
 
-static inline int drm_core_check_feature(struct drm_device *dev, int feature)
+static inline bool drm_core_check_feature(struct drm_device *dev, int feature)
 {
-	return ((dev->driver->driver_features & feature) ? 1 : 0);
+	return dev->driver->driver_features & feature;
 }
 
 /* returns true if currently okay to sleep */
-- cgit v1.2.3

From 49efffc7fbd48d5ea3d0dd60c218c7502d4a179d Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi
Date: Wed, 21 Mar 2018 12:20:24 +0200
Subject: drm: Add drm_mode_config->normalize_zpos boolean

Instead of drivers duplicating the drm_atomic_helper_check() code to be
able to normalize the zpos, they can use the normalize_zpos flag to let
the drm core do it.

Signed-off-by: Peter Ujfalusi
Signed-off-by: Tomi Valkeinen
Link: https://patchwork.freedesktop.org/patch/msgid/20180321102029.15248-2-peter.ujfalusi@ti.com
---
 drivers/gpu/drm/drm_atomic_helper.c | 11 +++++++++++
 include/drm/drm_mode_config.h       |  8 ++++++++
 include/drm/drm_plane.h             |  4 ++--
 3 files changed, 21 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c
index c35654591c12..d63c806e7d38 100644
--- a/drivers/gpu/drm/drm_atomic_helper.c
+++ b/drivers/gpu/drm/drm_atomic_helper.c
@@ -875,6 +875,11 @@ EXPORT_SYMBOL(drm_atomic_helper_check_planes);
  * functions depend upon an updated adjusted_mode.clock to e.g. properly compute
  * watermarks.
  *
+ * Note that zpos normalization will add all enabled planes to the state, which
+ * might not be desired for some drivers.
+ * For example, enabling or disabling a cursor plane with a fixed zpos value
+ * would force all other enabled planes into the state change.
+ *
  * RETURNS:
  * Zero for success or -errno
  */
@@ -887,6 +892,12 @@ int drm_atomic_helper_check(struct drm_device *dev,
 	if (ret)
 		return ret;
 
+	if (dev->mode_config.normalize_zpos) {
+		ret = drm_atomic_normalize_zpos(dev, state);
+		if (ret)
+			return ret;
+	}
+
 	ret = drm_atomic_helper_check_planes(dev, state);
 	if (ret)
 		return ret;
diff --git a/include/drm/drm_mode_config.h b/include/drm/drm_mode_config.h
index 7569f22ffef6..33b3a96d66d0 100644
--- a/include/drm/drm_mode_config.h
+++ b/include/drm/drm_mode_config.h
@@ -795,6 +795,14 @@ struct drm_mode_config {
 	 */
 	bool allow_fb_modifiers;
 
+	/**
+	 * @normalize_zpos:
+	 *
+	 * If true, the drm core will call drm_atomic_normalize_zpos() as part
+	 * of atomic mode checking from drm_atomic_helper_check().
+	 */
+	bool normalize_zpos;
+
 	/**
 	 * @modifiers_property: Plane property to list support modifier/format
 	 * combination.
diff --git a/include/drm/drm_plane.h b/include/drm/drm_plane.h
index f7bf4a48b1c3..d6da26d66a4b 100644
--- a/include/drm/drm_plane.h
+++ b/include/drm/drm_plane.h
@@ -51,8 +51,8 @@ struct drm_modeset_acquire_ctx;
  *	plane with a lower ID.
  * @normalized_zpos: normalized value of zpos: unique, range from 0 to N-1
  *	where N is the number of active planes for given crtc.
Note that - * the driver must call drm_atomic_normalize_zpos() to update this before - * it can be trusted. + * the driver must set drm_mode_config.normalize_zpos or call + * drm_atomic_normalize_zpos() to update this before it can be trusted. * @src: clipped source coordinates of the plane (in 16.16) * @dst: clipped destination coordinates of the plane * @state: backpointer to global drm_atomic_state -- cgit v1.2.3 From 0c9c7fd00e17907efb35697ecb9f2df39a0b536c Mon Sep 17 00:00:00 2001 From: Ville Syrjälä Date: Thu, 22 Mar 2018 22:27:37 +0200 Subject: drm/simple-kms-helper: Plumb plane state to the enable hook MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit tinydrm enable hook wants to play around with the new fb in .atomic_enable(), thus we'll need access to the plane state. Performed with coccinelle: @r1@ identifier F =~ ".*enable$"; identifier P, CS; @@ F( struct drm_simple_display_pipe *P ,struct drm_crtc_state *CS + ,struct drm_plane_state *plane_state ) { ... } @@ struct drm_simple_display_pipe *P; expression E; @@ { + struct drm_plane *plane; ... + plane = &P->plane; P->funcs->enable(P ,E + ,plane->state ); ... } @@ identifier P, CS; @@ struct drm_simple_display_pipe_funcs { ... void (*enable)(struct drm_simple_display_pipe *P ,struct drm_crtc_state *CS + ,struct drm_plane_state *plane_state ); ... }; v2: Pimp the commit message (David) Cc: Marek Vasut Cc: Eric Anholt Cc: David Lechner Cc: "Noralf Trønnes" Cc: Linus Walleij Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20180322202738.25817-1-ville.syrjala@linux.intel.com Reviewed-by: Noralf Trønnes --- drivers/gpu/drm/drm_simple_kms_helper.c | 4 +++- drivers/gpu/drm/mxsfb/mxsfb_drv.c | 3 ++- drivers/gpu/drm/pl111/pl111_display.c | 3 ++- drivers/gpu/drm/tinydrm/ili9225.c | 3 ++- drivers/gpu/drm/tinydrm/mi0283qt.c | 3 ++- drivers/gpu/drm/tinydrm/repaper.c | 3 ++- drivers/gpu/drm/tinydrm/st7586.c | 3 ++- drivers/gpu/drm/tinydrm/st7735r.c | 3 ++- drivers/gpu/drm/tve200/tve200_display.c | 3 ++- include/drm/drm_simple_kms_helper.h | 3 ++- 10 files changed, 21 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_simple_kms_helper.c b/drivers/gpu/drm/drm_simple_kms_helper.c index 987a353c7f72..7a00455ca568 100644 --- a/drivers/gpu/drm/drm_simple_kms_helper.c +++ b/drivers/gpu/drm/drm_simple_kms_helper.c @@ -64,13 +64,15 @@ static int drm_simple_kms_crtc_check(struct drm_crtc *crtc, static void drm_simple_kms_crtc_enable(struct drm_crtc *crtc, struct drm_crtc_state *old_state) { + struct drm_plane *plane; struct drm_simple_display_pipe *pipe; pipe = container_of(crtc, struct drm_simple_display_pipe, crtc); if (!pipe->funcs || !pipe->funcs->enable) return; - pipe->funcs->enable(pipe, crtc->state); + plane = &pipe->plane; + pipe->funcs->enable(pipe, crtc->state, plane->state); } static void drm_simple_kms_crtc_disable(struct drm_crtc *crtc, diff --git a/drivers/gpu/drm/mxsfb/mxsfb_drv.c b/drivers/gpu/drm/mxsfb/mxsfb_drv.c index 5cae8db9dcd4..b9c7507813db 100644 --- a/drivers/gpu/drm/mxsfb/mxsfb_drv.c +++ b/drivers/gpu/drm/mxsfb/mxsfb_drv.c @@ -99,7 +99,8 @@ static const struct drm_mode_config_funcs mxsfb_mode_config_funcs = { }; static void mxsfb_pipe_enable(struct drm_simple_display_pipe *pipe, - struct drm_crtc_state *crtc_state) + struct drm_crtc_state *crtc_state, + struct drm_plane_state *plane_state) { struct mxsfb_drm_private *mxsfb = drm_pipe_to_mxsfb_drm_private(pipe); diff --git a/drivers/gpu/drm/pl111/pl111_display.c 
b/drivers/gpu/drm/pl111/pl111_display.c index 310646427907..1fee578e05b0 100644 --- a/drivers/gpu/drm/pl111/pl111_display.c +++ b/drivers/gpu/drm/pl111/pl111_display.c @@ -120,7 +120,8 @@ static int pl111_display_check(struct drm_simple_display_pipe *pipe, } static void pl111_display_enable(struct drm_simple_display_pipe *pipe, - struct drm_crtc_state *cstate) + struct drm_crtc_state *cstate, + struct drm_plane_state *plane_state) { struct drm_crtc *crtc = &pipe->crtc; struct drm_plane *plane = &pipe->plane; diff --git a/drivers/gpu/drm/tinydrm/ili9225.c b/drivers/gpu/drm/tinydrm/ili9225.c index a0759502b81a..089d22798c8b 100644 --- a/drivers/gpu/drm/tinydrm/ili9225.c +++ b/drivers/gpu/drm/tinydrm/ili9225.c @@ -176,7 +176,8 @@ static const struct drm_framebuffer_funcs ili9225_fb_funcs = { }; static void ili9225_pipe_enable(struct drm_simple_display_pipe *pipe, - struct drm_crtc_state *crtc_state) + struct drm_crtc_state *crtc_state, + struct drm_plane_state *plane_state) { struct tinydrm_device *tdev = pipe_to_tinydrm(pipe); struct mipi_dbi *mipi = mipi_dbi_from_tinydrm(tdev); diff --git a/drivers/gpu/drm/tinydrm/mi0283qt.c b/drivers/gpu/drm/tinydrm/mi0283qt.c index d8ed6e6f8e05..82ad9b61898e 100644 --- a/drivers/gpu/drm/tinydrm/mi0283qt.c +++ b/drivers/gpu/drm/tinydrm/mi0283qt.c @@ -49,7 +49,8 @@ #define ILI9341_MADCTL_MY BIT(7) static void mi0283qt_enable(struct drm_simple_display_pipe *pipe, - struct drm_crtc_state *crtc_state) + struct drm_crtc_state *crtc_state, + struct drm_plane_state *plane_state) { struct tinydrm_device *tdev = pipe_to_tinydrm(pipe); struct mipi_dbi *mipi = mipi_dbi_from_tinydrm(tdev); diff --git a/drivers/gpu/drm/tinydrm/repaper.c b/drivers/gpu/drm/tinydrm/repaper.c index 75740630c410..33b4a71916e4 100644 --- a/drivers/gpu/drm/tinydrm/repaper.c +++ b/drivers/gpu/drm/tinydrm/repaper.c @@ -659,7 +659,8 @@ static void power_off(struct repaper_epd *epd) } static void repaper_pipe_enable(struct drm_simple_display_pipe *pipe, - struct drm_crtc_state *crtc_state) + struct drm_crtc_state *crtc_state, + struct drm_plane_state *plane_state) { struct tinydrm_device *tdev = pipe_to_tinydrm(pipe); struct repaper_epd *epd = epd_from_tinydrm(tdev); diff --git a/drivers/gpu/drm/tinydrm/st7586.c b/drivers/gpu/drm/tinydrm/st7586.c index a6396ef9cc4a..bb08b293c8ce 100644 --- a/drivers/gpu/drm/tinydrm/st7586.c +++ b/drivers/gpu/drm/tinydrm/st7586.c @@ -175,7 +175,8 @@ static const struct drm_framebuffer_funcs st7586_fb_funcs = { }; static void st7586_pipe_enable(struct drm_simple_display_pipe *pipe, - struct drm_crtc_state *crtc_state) + struct drm_crtc_state *crtc_state, + struct drm_plane_state *plane_state) { struct tinydrm_device *tdev = pipe_to_tinydrm(pipe); struct mipi_dbi *mipi = mipi_dbi_from_tinydrm(tdev); diff --git a/drivers/gpu/drm/tinydrm/st7735r.c b/drivers/gpu/drm/tinydrm/st7735r.c index 67d197ecfc4b..19b28f8c78db 100644 --- a/drivers/gpu/drm/tinydrm/st7735r.c +++ b/drivers/gpu/drm/tinydrm/st7735r.c @@ -37,7 +37,8 @@ #define ST7735R_MV BIT(5) static void jd_t18003_t01_pipe_enable(struct drm_simple_display_pipe *pipe, - struct drm_crtc_state *crtc_state) + struct drm_crtc_state *crtc_state, + struct drm_plane_state *plane_state) { struct tinydrm_device *tdev = pipe_to_tinydrm(pipe); struct mipi_dbi *mipi = mipi_dbi_from_tinydrm(tdev); diff --git a/drivers/gpu/drm/tve200/tve200_display.c b/drivers/gpu/drm/tve200/tve200_display.c index db397fcb345a..108f3b2b5d25 100644 --- a/drivers/gpu/drm/tve200/tve200_display.c +++ b/drivers/gpu/drm/tve200/tve200_display.c @@ -120,7 
+120,8 @@ static int tve200_display_check(struct drm_simple_display_pipe *pipe, } static void tve200_display_enable(struct drm_simple_display_pipe *pipe, - struct drm_crtc_state *cstate) + struct drm_crtc_state *cstate, + struct drm_plane_state *plane_state) { struct drm_crtc *crtc = &pipe->crtc; struct drm_plane *plane = &pipe->plane; diff --git a/include/drm/drm_simple_kms_helper.h b/include/drm/drm_simple_kms_helper.h index 1b4e352143fd..b02793742317 100644 --- a/include/drm/drm_simple_kms_helper.h +++ b/include/drm/drm_simple_kms_helper.h @@ -64,7 +64,8 @@ struct drm_simple_display_pipe_funcs { * This hook is optional. */ void (*enable)(struct drm_simple_display_pipe *pipe, - struct drm_crtc_state *crtc_state); + struct drm_crtc_state *crtc_state, + struct drm_plane_state *plane_state); /** * @disable: * -- cgit v1.2.3 From e85d30060eddccfcfbf7fdbd61a23cfbda05cc59 Mon Sep 17 00:00:00 2001 From: Ville Syrjälä Date: Fri, 23 Mar 2018 17:35:09 +0200 Subject: drm/tinydrm: Make fb_dirty into a lower level hook MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mipi_dbi_enable_flush() wants to call the fb->dirty() hook from the bowels of the .atomic_enable() hook. That prevents us from taking the plane mutex in fb->dirty() unless we also plumb down the acquire context. Instead it seems simpler to split the fb->dirty() into a tinydrm specific lower level hook that can be called from mipi_dbi_enable_flush() and from a generic higher level tinydrm_fb_dirty() helper. As we don't have a tinydrm specific vfuncs table we'll just stick it into tinydrm_device directly for now. v2: Deal with the fb->dirty() in tinydrm_display_pipe_update() as well (Noralf) Cc: "Noralf Trønnes" Cc: David Lechner Signed-off-by: Ville Syrjälä Reviewed-by: Noralf Trønnes Link: https://patchwork.freedesktop.org/patch/msgid/20180323153509.15287-1-ville.syrjala@linux.intel.com Reviewed-by: Noralf Trønnes Tested-by: Noralf Trønnes --- drivers/gpu/drm/tinydrm/core/tinydrm-helpers.c | 30 ++++++++++++++++++++++++++ drivers/gpu/drm/tinydrm/core/tinydrm-pipe.c | 5 ++--- drivers/gpu/drm/tinydrm/ili9225.c | 23 ++++++-------------- drivers/gpu/drm/tinydrm/mi0283qt.c | 2 +- drivers/gpu/drm/tinydrm/mipi-dbi.c | 30 ++++++++++---------------- drivers/gpu/drm/tinydrm/repaper.c | 28 ++++++++---------------- drivers/gpu/drm/tinydrm/st7586.c | 23 ++++++-------------- drivers/gpu/drm/tinydrm/st7735r.c | 2 +- include/drm/tinydrm/mipi-dbi.h | 4 +++- include/drm/tinydrm/tinydrm-helpers.h | 5 +++++ include/drm/tinydrm/tinydrm.h | 4 ++++ 11 files changed, 78 insertions(+), 78 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/tinydrm/core/tinydrm-helpers.c b/drivers/gpu/drm/tinydrm/core/tinydrm-helpers.c index d1c3ce9ab294..dcd390163a4a 100644 --- a/drivers/gpu/drm/tinydrm/core/tinydrm-helpers.c +++ b/drivers/gpu/drm/tinydrm/core/tinydrm-helpers.c @@ -78,6 +78,36 @@ bool tinydrm_merge_clips(struct drm_clip_rect *dst, } EXPORT_SYMBOL(tinydrm_merge_clips); +int tinydrm_fb_dirty(struct drm_framebuffer *fb, + struct drm_file *file_priv, + unsigned int flags, unsigned int color, + struct drm_clip_rect *clips, + unsigned int num_clips) +{ + struct tinydrm_device *tdev = fb->dev->dev_private; + struct drm_plane *plane = &tdev->pipe.plane; + int ret = 0; + + drm_modeset_lock(&plane->mutex, NULL); + + /* fbdev can flush even when we're not interested */ + if (plane->state->fb == fb) { + mutex_lock(&tdev->dirty_lock); + ret = tdev->fb_dirty(fb, file_priv, flags, + color, clips, num_clips); + 
mutex_unlock(&tdev->dirty_lock); + } + + drm_modeset_unlock(&plane->mutex); + + if (ret) + dev_err_once(fb->dev->dev, + "Failed to update display %d\n", ret); + + return ret; +} +EXPORT_SYMBOL(tinydrm_fb_dirty); + /** * tinydrm_memcpy - Copy clip buffer * @dst: Destination buffer diff --git a/drivers/gpu/drm/tinydrm/core/tinydrm-pipe.c b/drivers/gpu/drm/tinydrm/core/tinydrm-pipe.c index 11ae950b0fc9..e68b528ae64d 100644 --- a/drivers/gpu/drm/tinydrm/core/tinydrm-pipe.c +++ b/drivers/gpu/drm/tinydrm/core/tinydrm-pipe.c @@ -125,9 +125,8 @@ void tinydrm_display_pipe_update(struct drm_simple_display_pipe *pipe, struct drm_crtc *crtc = &tdev->pipe.crtc; if (fb && (fb != old_state->fb)) { - pipe->plane.fb = fb; - if (fb->funcs->dirty) - fb->funcs->dirty(fb, NULL, 0, 0, NULL, 0); + if (tdev->fb_dirty) + tdev->fb_dirty(fb, NULL, 0, 0, NULL, 0); } if (crtc->state->event) { diff --git a/drivers/gpu/drm/tinydrm/ili9225.c b/drivers/gpu/drm/tinydrm/ili9225.c index 089d22798c8b..0874e877b111 100644 --- a/drivers/gpu/drm/tinydrm/ili9225.c +++ b/drivers/gpu/drm/tinydrm/ili9225.c @@ -88,14 +88,8 @@ static int ili9225_fb_dirty(struct drm_framebuffer *fb, bool full; void *tr; - mutex_lock(&tdev->dirty_lock); - if (!mipi->enabled) - goto out_unlock; - - /* fbdev can flush even when we're not interested */ - if (tdev->pipe.plane.fb != fb) - goto out_unlock; + return 0; full = tinydrm_merge_clips(&clip, clips, num_clips, flags, fb->width, fb->height); @@ -108,7 +102,7 @@ static int ili9225_fb_dirty(struct drm_framebuffer *fb, tr = mipi->tx_buf; ret = mipi_dbi_buf_copy(mipi->tx_buf, fb, &clip, swap); if (ret) - goto out_unlock; + return ret; } else { tr = cma_obj->vaddr; } @@ -159,20 +153,13 @@ static int ili9225_fb_dirty(struct drm_framebuffer *fb, ret = mipi_dbi_command_buf(mipi, ILI9225_WRITE_DATA_TO_GRAM, tr, (clip.x2 - clip.x1) * (clip.y2 - clip.y1) * 2); -out_unlock: - mutex_unlock(&tdev->dirty_lock); - - if (ret) - dev_err_once(fb->dev->dev, "Failed to update display %d\n", - ret); - return ret; } static const struct drm_framebuffer_funcs ili9225_fb_funcs = { .destroy = drm_gem_fb_destroy, .create_handle = drm_gem_fb_create_handle, - .dirty = ili9225_fb_dirty, + .dirty = tinydrm_fb_dirty, }; static void ili9225_pipe_enable(struct drm_simple_display_pipe *pipe, @@ -269,7 +256,7 @@ static void ili9225_pipe_enable(struct drm_simple_display_pipe *pipe, ili9225_command(mipi, ILI9225_DISPLAY_CONTROL_1, 0x1017); - mipi_dbi_enable_flush(mipi); + mipi_dbi_enable_flush(mipi, crtc_state, plane_state); } static void ili9225_pipe_disable(struct drm_simple_display_pipe *pipe) @@ -342,6 +329,8 @@ static int ili9225_init(struct device *dev, struct mipi_dbi *mipi, if (ret) return ret; + tdev->fb_dirty = ili9225_fb_dirty; + ret = tinydrm_display_pipe_init(tdev, pipe_funcs, DRM_MODE_CONNECTOR_VIRTUAL, ili9225_formats, diff --git a/drivers/gpu/drm/tinydrm/mi0283qt.c b/drivers/gpu/drm/tinydrm/mi0283qt.c index 82ad9b61898e..4e6d2ee94e55 100644 --- a/drivers/gpu/drm/tinydrm/mi0283qt.c +++ b/drivers/gpu/drm/tinydrm/mi0283qt.c @@ -127,7 +127,7 @@ static void mi0283qt_enable(struct drm_simple_display_pipe *pipe, msleep(100); out_enable: - mipi_dbi_enable_flush(mipi); + mipi_dbi_enable_flush(mipi, crtc_state, plane_state); } static const struct drm_simple_display_pipe_funcs mi0283qt_pipe_funcs = { diff --git a/drivers/gpu/drm/tinydrm/mipi-dbi.c b/drivers/gpu/drm/tinydrm/mipi-dbi.c index 9e903812b573..4d1fb31a781f 100644 --- a/drivers/gpu/drm/tinydrm/mipi-dbi.c +++ b/drivers/gpu/drm/tinydrm/mipi-dbi.c @@ -219,14 +219,8 @@ static int 
mipi_dbi_fb_dirty(struct drm_framebuffer *fb, bool full; void *tr; - mutex_lock(&tdev->dirty_lock); - if (!mipi->enabled) - goto out_unlock; - - /* fbdev can flush even when we're not interested */ - if (tdev->pipe.plane.fb != fb) - goto out_unlock; + return 0; full = tinydrm_merge_clips(&clip, clips, num_clips, flags, fb->width, fb->height); @@ -239,7 +233,7 @@ static int mipi_dbi_fb_dirty(struct drm_framebuffer *fb, tr = mipi->tx_buf; ret = mipi_dbi_buf_copy(mipi->tx_buf, fb, &clip, swap); if (ret) - goto out_unlock; + return ret; } else { tr = cma_obj->vaddr; } @@ -254,20 +248,13 @@ static int mipi_dbi_fb_dirty(struct drm_framebuffer *fb, ret = mipi_dbi_command_buf(mipi, MIPI_DCS_WRITE_MEMORY_START, tr, (clip.x2 - clip.x1) * (clip.y2 - clip.y1) * 2); -out_unlock: - mutex_unlock(&tdev->dirty_lock); - - if (ret) - dev_err_once(fb->dev->dev, "Failed to update display %d\n", - ret); - return ret; } static const struct drm_framebuffer_funcs mipi_dbi_fb_funcs = { .destroy = drm_gem_fb_destroy, .create_handle = drm_gem_fb_create_handle, - .dirty = mipi_dbi_fb_dirty, + .dirty = tinydrm_fb_dirty, }; /** @@ -278,13 +265,16 @@ static const struct drm_framebuffer_funcs mipi_dbi_fb_funcs = { * enables the backlight. Drivers can use this in their * &drm_simple_display_pipe_funcs->enable callback. */ -void mipi_dbi_enable_flush(struct mipi_dbi *mipi) +void mipi_dbi_enable_flush(struct mipi_dbi *mipi, + struct drm_crtc_state *crtc_state, + struct drm_plane_state *plane_state) { - struct drm_framebuffer *fb = mipi->tinydrm.pipe.plane.fb; + struct tinydrm_device *tdev = &mipi->tinydrm; + struct drm_framebuffer *fb = plane_state->fb; mipi->enabled = true; if (fb) - fb->funcs->dirty(fb, NULL, 0, 0, NULL, 0); + tdev->fb_dirty(fb, NULL, 0, 0, NULL, 0); backlight_enable(mipi->backlight); } @@ -381,6 +371,8 @@ int mipi_dbi_init(struct device *dev, struct mipi_dbi *mipi, if (ret) return ret; + tdev->fb_dirty = mipi_dbi_fb_dirty; + /* TODO: Maybe add DRM_MODE_CONNECTOR_SPI */ ret = tinydrm_display_pipe_init(tdev, pipe_funcs, DRM_MODE_CONNECTOR_VIRTUAL, diff --git a/drivers/gpu/drm/tinydrm/repaper.c b/drivers/gpu/drm/tinydrm/repaper.c index 33b4a71916e4..bb6f80a81899 100644 --- a/drivers/gpu/drm/tinydrm/repaper.c +++ b/drivers/gpu/drm/tinydrm/repaper.c @@ -540,14 +540,8 @@ static int repaper_fb_dirty(struct drm_framebuffer *fb, clip.y1 = 0; clip.y2 = fb->height; - mutex_lock(&tdev->dirty_lock); - if (!epd->enabled) - goto out_unlock; - - /* fbdev can flush even when we're not interested */ - if (tdev->pipe.plane.fb != fb) - goto out_unlock; + return 0; repaper_get_temperature(epd); @@ -555,16 +549,14 @@ static int repaper_fb_dirty(struct drm_framebuffer *fb, epd->factored_stage_time); buf = kmalloc(fb->width * fb->height, GFP_KERNEL); - if (!buf) { - ret = -ENOMEM; - goto out_unlock; - } + if (!buf) + return -ENOMEM; if (import_attach) { ret = dma_buf_begin_cpu_access(import_attach->dmabuf, DMA_FROM_DEVICE); if (ret) - goto out_unlock; + goto out_free; } tinydrm_xrgb8888_to_gray8(buf, cma_obj->vaddr, fb, &clip); @@ -573,7 +565,7 @@ static int repaper_fb_dirty(struct drm_framebuffer *fb, ret = dma_buf_end_cpu_access(import_attach->dmabuf, DMA_FROM_DEVICE); if (ret) - goto out_unlock; + goto out_free; } repaper_gray8_to_mono_reversed(buf, fb->width, fb->height); @@ -625,11 +617,7 @@ static int repaper_fb_dirty(struct drm_framebuffer *fb, } } -out_unlock: - mutex_unlock(&tdev->dirty_lock); - - if (ret) - DRM_DEV_ERROR(fb->dev->dev, "Failed to update display (%d)\n", ret); +out_free: kfree(buf); return ret; @@ -638,7 
+626,7 @@ out_unlock: static const struct drm_framebuffer_funcs repaper_fb_funcs = { .destroy = drm_gem_fb_destroy, .create_handle = drm_gem_fb_create_handle, - .dirty = repaper_fb_dirty, + .dirty = tinydrm_fb_dirty, }; static void power_off(struct repaper_epd *epd) @@ -1070,6 +1058,8 @@ static int repaper_probe(struct spi_device *spi) if (ret) return ret; + tdev->fb_dirty = repaper_fb_dirty; + ret = tinydrm_display_pipe_init(tdev, &repaper_pipe_funcs, DRM_MODE_CONNECTOR_VIRTUAL, repaper_formats, diff --git a/drivers/gpu/drm/tinydrm/st7586.c b/drivers/gpu/drm/tinydrm/st7586.c index bb08b293c8ce..22644b88199a 100644 --- a/drivers/gpu/drm/tinydrm/st7586.c +++ b/drivers/gpu/drm/tinydrm/st7586.c @@ -120,14 +120,8 @@ static int st7586_fb_dirty(struct drm_framebuffer *fb, int start, end; int ret = 0; - mutex_lock(&tdev->dirty_lock); - if (!mipi->enabled) - goto out_unlock; - - /* fbdev can flush even when we're not interested */ - if (tdev->pipe.plane.fb != fb) - goto out_unlock; + return 0; tinydrm_merge_clips(&clip, clips, num_clips, flags, fb->width, fb->height); @@ -141,7 +135,7 @@ static int st7586_fb_dirty(struct drm_framebuffer *fb, ret = st7586_buf_copy(mipi->tx_buf, fb, &clip); if (ret) - goto out_unlock; + return ret; /* Pixels are packed 3 per byte */ start = clip.x1 / 3; @@ -158,20 +152,13 @@ static int st7586_fb_dirty(struct drm_framebuffer *fb, (u8 *)mipi->tx_buf, (end - start) * (clip.y2 - clip.y1)); -out_unlock: - mutex_unlock(&tdev->dirty_lock); - - if (ret) - dev_err_once(fb->dev->dev, "Failed to update display %d\n", - ret); - return ret; } static const struct drm_framebuffer_funcs st7586_fb_funcs = { .destroy = drm_gem_fb_destroy, .create_handle = drm_gem_fb_create_handle, - .dirty = st7586_fb_dirty, + .dirty = tinydrm_fb_dirty, }; static void st7586_pipe_enable(struct drm_simple_display_pipe *pipe, @@ -238,7 +225,7 @@ static void st7586_pipe_enable(struct drm_simple_display_pipe *pipe, mipi_dbi_command(mipi, MIPI_DCS_SET_DISPLAY_ON); - mipi_dbi_enable_flush(mipi); + mipi_dbi_enable_flush(mipi, crtc_state, plane_state); } static void st7586_pipe_disable(struct drm_simple_display_pipe *pipe) @@ -278,6 +265,8 @@ static int st7586_init(struct device *dev, struct mipi_dbi *mipi, if (ret) return ret; + tdev->fb_dirty = st7586_fb_dirty; + ret = tinydrm_display_pipe_init(tdev, pipe_funcs, DRM_MODE_CONNECTOR_VIRTUAL, st7586_formats, diff --git a/drivers/gpu/drm/tinydrm/st7735r.c b/drivers/gpu/drm/tinydrm/st7735r.c index 19b28f8c78db..189a07894d36 100644 --- a/drivers/gpu/drm/tinydrm/st7735r.c +++ b/drivers/gpu/drm/tinydrm/st7735r.c @@ -99,7 +99,7 @@ static void jd_t18003_t01_pipe_enable(struct drm_simple_display_pipe *pipe, msleep(20); - mipi_dbi_enable_flush(mipi); + mipi_dbi_enable_flush(mipi, crtc_state, plane_state); } static const struct drm_simple_display_pipe_funcs jd_t18003_t01_pipe_funcs = { diff --git a/include/drm/tinydrm/mipi-dbi.h b/include/drm/tinydrm/mipi-dbi.h index 44e824af2ef6..b8ba58861986 100644 --- a/include/drm/tinydrm/mipi-dbi.h +++ b/include/drm/tinydrm/mipi-dbi.h @@ -67,7 +67,9 @@ int mipi_dbi_init(struct device *dev, struct mipi_dbi *mipi, const struct drm_simple_display_pipe_funcs *pipe_funcs, struct drm_driver *driver, const struct drm_display_mode *mode, unsigned int rotation); -void mipi_dbi_enable_flush(struct mipi_dbi *mipi); +void mipi_dbi_enable_flush(struct mipi_dbi *mipi, + struct drm_crtc_state *crtc_state, + struct drm_plane_state *plane_state); void mipi_dbi_pipe_disable(struct drm_simple_display_pipe *pipe); void mipi_dbi_hw_reset(struct
mipi_dbi *mipi); bool mipi_dbi_display_is_on(struct mipi_dbi *mipi); diff --git a/include/drm/tinydrm/tinydrm-helpers.h b/include/drm/tinydrm/tinydrm-helpers.h index 0a4ddbc04c60..5b96f0b12c8c 100644 --- a/include/drm/tinydrm/tinydrm-helpers.h +++ b/include/drm/tinydrm/tinydrm-helpers.h @@ -36,6 +36,11 @@ static inline bool tinydrm_machine_little_endian(void) bool tinydrm_merge_clips(struct drm_clip_rect *dst, struct drm_clip_rect *src, unsigned int num_clips, unsigned int flags, u32 max_width, u32 max_height); +int tinydrm_fb_dirty(struct drm_framebuffer *fb, + struct drm_file *file_priv, + unsigned int flags, unsigned int color, + struct drm_clip_rect *clips, + unsigned int num_clips); void tinydrm_memcpy(void *dst, void *vaddr, struct drm_framebuffer *fb, struct drm_clip_rect *clip); void tinydrm_swab16(u16 *dst, void *vaddr, struct drm_framebuffer *fb, diff --git a/include/drm/tinydrm/tinydrm.h b/include/drm/tinydrm/tinydrm.h index 77a93ec577fd..6e2b960e25eb 100644 --- a/include/drm/tinydrm/tinydrm.h +++ b/include/drm/tinydrm/tinydrm.h @@ -26,6 +26,10 @@ struct tinydrm_device { struct drm_simple_display_pipe pipe; struct mutex dirty_lock; const struct drm_framebuffer_funcs *fb_funcs; + int (*fb_dirty)(struct drm_framebuffer *framebuffer, + struct drm_file *file_priv, unsigned flags, + unsigned color, struct drm_clip_rect *clips, + unsigned num_clips); }; static inline struct tinydrm_device * -- cgit v1.2.3 From bee330f3d67273a68dcb99f59480d59553c008b2 Mon Sep 17 00:00:00 2001 From: Noralf Trønnes Date: Wed, 28 Mar 2018 10:38:35 +0300 Subject: drm: Use srcu to protect drm_device.unplugged MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use srcu to protect drm_device.unplugged in a race free manner. Drivers can use drm_dev_enter()/drm_dev_exit() to protect and mark sections preventing access to device resources that are not available after the device is gone. Suggested-by: Daniel Vetter Signed-off-by: Noralf Trønnes Signed-off-by: Oleksandr Andrushchenko Reviewed-by: Oleksandr Andrushchenko Tested-by: Oleksandr Andrushchenko Cc: intel-gfx@lists.freedesktop.org Reviewed-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/1522222715-11814-1-git-send-email-andr2000@gmail.com --- drivers/gpu/drm/drm_drv.c | 54 ++++++++++++++++++++++++++++++++++++++++++----- include/drm/drm_device.h | 9 +++++++- include/drm/drm_drv.h | 15 +++++++++---- 3 files changed, 68 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c index a1b9338736e3..32a83b41ab61 100644 --- a/drivers/gpu/drm/drm_drv.c +++ b/drivers/gpu/drm/drm_drv.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -75,6 +76,8 @@ static bool drm_core_init_complete = false; static struct dentry *drm_debugfs_root; +DEFINE_STATIC_SRCU(drm_unplug_srcu); + /* * DRM Minors * A DRM device can provide several char-dev interfaces on the DRM-Major. Each @@ -318,18 +321,51 @@ void drm_put_dev(struct drm_device *dev) } EXPORT_SYMBOL(drm_put_dev); -static void drm_device_set_unplugged(struct drm_device *dev) +/** + * drm_dev_enter - Enter device critical section + * @dev: DRM device + * @idx: Pointer to index that will be passed to the matching drm_dev_exit() + * + * This function marks and protects the beginning of a section that should not + * be entered after the device has been unplugged. The section end is marked + * with drm_dev_exit(). Calls to this function can be nested. 
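+ *
+ * Editor's note, an illustrative sketch that is not part of the original
+ * patch: a driver entry point would typically guard hardware access as
+ *
+ *	int idx;
+ *
+ *	if (!drm_dev_enter(dev, &idx))
+ *		return -ENODEV;
+ *	...access device resources...
+ *	drm_dev_exit(idx);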
+ * + * Returns: + * True if it is OK to enter the section, false otherwise. + */ +bool drm_dev_enter(struct drm_device *dev, int *idx) +{ + *idx = srcu_read_lock(&drm_unplug_srcu); + + if (dev->unplugged) { + srcu_read_unlock(&drm_unplug_srcu, *idx); + return false; + } + + return true; +} +EXPORT_SYMBOL(drm_dev_enter); + +/** + * drm_dev_exit - Exit device critical section + * @idx: index returned from drm_dev_enter() + * + * This function marks the end of a section that should not be entered after + * the device has been unplugged. + */ +void drm_dev_exit(int idx) { - smp_wmb(); - atomic_set(&dev->unplugged, 1); + srcu_read_unlock(&drm_unplug_srcu, idx); } +EXPORT_SYMBOL(drm_dev_exit); /** * drm_dev_unplug - unplug a DRM device * @dev: DRM device * * This unplugs a hotpluggable DRM device, which makes it inaccessible to - * userspace operations. Entry-points can use drm_dev_is_unplugged(). This + * userspace operations. Entry-points can use drm_dev_enter() and + * drm_dev_exit() to protect device resources in a race free manner. This * essentially unregisters the device like drm_dev_unregister(), but can be * called while there are still open users of @dev. */ @@ -338,10 +374,18 @@ void drm_dev_unplug(struct drm_device *dev) drm_dev_unregister(dev); mutex_lock(&drm_global_mutex); - drm_device_set_unplugged(dev); if (dev->open_count == 0) drm_dev_put(dev); mutex_unlock(&drm_global_mutex); + + /* + * After synchronizing any critical read section is guaranteed to see + * the new value of ->unplugged, and any critical section which might + * still have seen the old value of ->unplugged is guaranteed to have + * finished. + */ + dev->unplugged = true; + synchronize_srcu(&drm_unplug_srcu); } EXPORT_SYMBOL(drm_dev_unplug); diff --git a/include/drm/drm_device.h b/include/drm/drm_device.h index 7c4fa32f3fc6..3a0eac2885b7 100644 --- a/include/drm/drm_device.h +++ b/include/drm/drm_device.h @@ -46,7 +46,14 @@ struct drm_device { /* currently active master for this device. Protected by master_mutex */ struct drm_master *master; - atomic_t unplugged; /**< Flag whether dev is dead */ + /** + * @unplugged: + * + * Flag to tell if the device has been unplugged. + * See drm_dev_enter() and drm_dev_is_unplugged(). 
+ */ + bool unplugged; + struct inode *anon_inode; /**< inode for private address-space */ char *unique; /**< unique name of the device */ /*@} */ diff --git a/include/drm/drm_drv.h b/include/drm/drm_drv.h index d32b688eb346..ff7312c40cd8 100644 --- a/include/drm/drm_drv.h +++ b/include/drm/drm_drv.h @@ -623,6 +623,8 @@ void drm_dev_get(struct drm_device *dev); void drm_dev_put(struct drm_device *dev); void drm_dev_unref(struct drm_device *dev); void drm_put_dev(struct drm_device *dev); +bool drm_dev_enter(struct drm_device *dev, int *idx); +void drm_dev_exit(int idx); void drm_dev_unplug(struct drm_device *dev); /** @@ -634,11 +636,16 @@ void drm_dev_unplug(struct drm_device *dev); * unplugged, these two functions guarantee that any store before calling * drm_dev_unplug() is visible to callers of this function after it completes */ -static inline int drm_dev_is_unplugged(struct drm_device *dev) +static inline bool drm_dev_is_unplugged(struct drm_device *dev) { - int ret = atomic_read(&dev->unplugged); - smp_rmb(); - return ret; + int idx; + + if (drm_dev_enter(dev, &idx)) { + drm_dev_exit(idx); + return false; + } + + return true; } -- cgit v1.2.3 From 4f212e40468650e220c1770876c7f25b8e0c1ff5 Mon Sep 17 00:00:00 2001 From: José Roberto de Souza Date: Wed, 28 Mar 2018 15:30:37 -0700 Subject: drm: Add DP PSR2 sink enable bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To comply with eDP 1.4a this bit should be set when enabling PSR2. Signed-off-by: José Roberto de Souza Reviewed-by: Rodrigo Vivi Signed-off-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20180328223046.16125-1-jose.souza@intel.com --- include/drm/drm_dp_helper.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/drm/drm_dp_helper.h b/include/drm/drm_dp_helper.h index 4de97e94ef9d..a62714578b93 100644 --- a/include/drm/drm_dp_helper.h +++ b/include/drm/drm_dp_helper.h @@ -477,6 +477,7 @@ # define DP_PSR_FRAME_CAPTURE (1 << 3) # define DP_PSR_SELECTIVE_UPDATE (1 << 4) # define DP_PSR_IRQ_HPD_WITH_CRC_ERRORS (1 << 5) +# define DP_PSR_ENABLE_PSR2 (1 << 6) /* eDP 1.4a */ #define DP_ADAPTER_CTRL 0x1a0 # define DP_ADAPTER_CTRL_FORCE_LOAD_SENSE (1 << 0) -- cgit v1.2.3 From fe36948afb08e361d12b2878348778aeaba74134 Mon Sep 17 00:00:00 2001 From: José Roberto de Souza Date: Wed, 28 Mar 2018 15:30:38 -0700 Subject: drm: Add DP last received PSR SDP VSC register and bits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a register to help debug what is in the last PSR SDP VSC packet received by the sink.
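[Editor's note: an illustrative sketch of how a driver might dump this register using the bit definitions added below. drm_dp_dpcd_readb() is the existing DP AUX helper; the aux pointer and the message format are assumptions, not part of the patch:

	u8 sdp;

	if (drm_dp_dpcd_readb(aux, DP_LAST_RECEIVED_PSR_SDP, &sdp) == 1)
		DRM_DEBUG_KMS("PSR SDP: state=%u su_valid=%u y_valid=%u\n",
			      !!(sdp & DP_PSR_STATE_BIT),
			      !!(sdp & DP_SU_VALID),
			      !!(sdp & DP_Y_COORDINATE_VALID));]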
Signed-off-by: José Roberto de Souza Reviewed-by: Rodrigo Vivi Signed-off-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20180328223046.16125-2-jose.souza@intel.com --- include/drm/drm_dp_helper.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/drm/drm_dp_helper.h b/include/drm/drm_dp_helper.h index a62714578b93..c6853f0fef2a 100644 --- a/include/drm/drm_dp_helper.h +++ b/include/drm/drm_dp_helper.h @@ -794,6 +794,15 @@ # define DP_LAST_ACTUAL_SYNCHRONIZATION_LATENCY_MASK (0xf << 4) # define DP_LAST_ACTUAL_SYNCHRONIZATION_LATENCY_SHIFT 4 +#define DP_LAST_RECEIVED_PSR_SDP 0x200a /* eDP 1.2 */ +# define DP_PSR_STATE_BIT (1 << 0) /* eDP 1.2 */ +# define DP_UPDATE_RFB_BIT (1 << 1) /* eDP 1.2 */ +# define DP_CRC_VALID_BIT (1 << 2) /* eDP 1.2 */ +# define DP_SU_VALID (1 << 3) /* eDP 1.4 */ +# define DP_FIRST_SCAN_LINE_SU_REGION (1 << 4) /* eDP 1.4 */ +# define DP_LAST_SCAN_LINE_SU_REGION (1 << 5) /* eDP 1.4 */ +# define DP_Y_COORDINATE_VALID (1 << 6) /* eDP 1.4a */ + #define DP_RECEIVER_ALPM_STATUS 0x200b /* eDP 1.4 */ # define DP_ALPM_LOCK_TIMEOUT_ERROR (1 << 0) -- cgit v1.2.3 From a2c09ac0fb6756d7085c359b6c020ef8b4205e0f Mon Sep 17 00:00:00 2001 From: Inju Song Date: Tue, 27 Mar 2018 23:14:40 +0900 Subject: netfilter: ipvs: Keep latest weight of destination Hash-based schedulers such as source hash or maglev hash should ignore a weight change to 0, while still allowing the weight to change between non-zero values. So struct ip_vs_dest needs to keep the latest non-zero weight. Signed-off-by: Inju Song Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 1 + net/netfilter/ipvs/ip_vs_ctl.c | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index eb0bec043c96..0ac795b41ab8 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -668,6 +668,7 @@ struct ip_vs_dest { volatile unsigned int flags; /* dest status flags */ atomic_t conn_flags; /* flags to copy to conn */ atomic_t weight; /* server weight */ + atomic_t last_weight; /* server latest weight */ refcount_t refcnt; /* reference counter */ struct ip_vs_stats stats; /* statistics */ diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 5ebde4b15810..b91bb70ece92 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -821,6 +821,10 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, if (add && udest->af != svc->af) ipvs->mixed_address_family_dests++; + /* keep the last_weight with latest non-0 weight */ + if (add || udest->weight != 0) + atomic_set(&dest->last_weight, udest->weight); + /* set the weight and the flags */ atomic_set(&dest->weight, udest->weight); conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK; -- cgit v1.2.3 From a70cdb9eddcfd4ba20d69b84149b4a38648455ac Mon Sep 17 00:00:00 2001 From: Nayan Deshmukh Date: Thu, 29 Mar 2018 22:36:33 +0530 Subject: drm/scheduler: move the tracepoints file from the include directory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move it in with the scheduler code.
This is mostly a straightforward rename with no code change, except for updating the TRACE_INCLUDE_PATH. Signed-off-by: Nayan Deshmukh Suggested-by: Christian König Reviewed-by: Christian König Acked-by: Lucas Stach Signed-off-by: Alex Deucher --- drivers/gpu/drm/scheduler/gpu_scheduler.c | 2 +- drivers/gpu/drm/scheduler/gpu_scheduler_trace.h | 82 +++++++++++++++++++++++++ include/drm/gpu_scheduler_trace.h | 82 ------------------------- 3 files changed, 83 insertions(+), 83 deletions(-) create mode 100644 drivers/gpu/drm/scheduler/gpu_scheduler_trace.h delete mode 100644 include/drm/gpu_scheduler_trace.h (limited to 'include') diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c b/drivers/gpu/drm/scheduler/gpu_scheduler.c index 1d368bc66ac2..310275eaf128 100644 --- a/drivers/gpu/drm/scheduler/gpu_scheduler.c +++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c @@ -30,7 +30,7 @@ #include #define CREATE_TRACE_POINTS -#include +#include "gpu_scheduler_trace.h" #define to_drm_sched_job(sched_job) \ container_of((sched_job), struct drm_sched_job, queue_node) diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h b/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h new file mode 100644 index 000000000000..4998ad950a48 --- /dev/null +++ b/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h @@ -0,0 +1,82 @@ +/* + * Copyright 2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE.
+ * + */ + +#if !defined(_GPU_SCHED_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _GPU_SCHED_TRACE_H_ + +#include +#include +#include + +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM gpu_scheduler +#define TRACE_INCLUDE_FILE gpu_scheduler_trace + +TRACE_EVENT(drm_sched_job, + TP_PROTO(struct drm_sched_job *sched_job, struct drm_sched_entity *entity), + TP_ARGS(sched_job, entity), + TP_STRUCT__entry( + __field(struct drm_sched_entity *, entity) + __field(struct dma_fence *, fence) + __field(const char *, name) + __field(uint64_t, id) + __field(u32, job_count) + __field(int, hw_job_count) + ), + + TP_fast_assign( + __entry->entity = entity; + __entry->id = sched_job->id; + __entry->fence = &sched_job->s_fence->finished; + __entry->name = sched_job->sched->name; + __entry->job_count = spsc_queue_count(&entity->job_queue); + __entry->hw_job_count = atomic_read( + &sched_job->sched->hw_rq_count); + ), + TP_printk("entity=%p, id=%llu, fence=%p, ring=%s, job count:%u, hw job count:%d", + __entry->entity, __entry->id, + __entry->fence, __entry->name, + __entry->job_count, __entry->hw_job_count) +); + +TRACE_EVENT(drm_sched_process_job, + TP_PROTO(struct drm_sched_fence *fence), + TP_ARGS(fence), + TP_STRUCT__entry( + __field(struct dma_fence *, fence) + ), + + TP_fast_assign( + __entry->fence = &fence->finished; + ), + TP_printk("fence=%p signaled", __entry->fence) +); + +#endif + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ../../drivers/gpu/drm/scheduler +#include diff --git a/include/drm/gpu_scheduler_trace.h b/include/drm/gpu_scheduler_trace.h deleted file mode 100644 index 0789e8d0a0e1..000000000000 --- a/include/drm/gpu_scheduler_trace.h +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright 2017 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- * - */ - -#if !defined(_GPU_SCHED_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) -#define _GPU_SCHED_TRACE_H_ - -#include -#include -#include - -#include - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM gpu_scheduler -#define TRACE_INCLUDE_FILE gpu_scheduler_trace - -TRACE_EVENT(drm_sched_job, - TP_PROTO(struct drm_sched_job *sched_job, struct drm_sched_entity *entity), - TP_ARGS(sched_job, entity), - TP_STRUCT__entry( - __field(struct drm_sched_entity *, entity) - __field(struct dma_fence *, fence) - __field(const char *, name) - __field(uint64_t, id) - __field(u32, job_count) - __field(int, hw_job_count) - ), - - TP_fast_assign( - __entry->entity = entity; - __entry->id = sched_job->id; - __entry->fence = &sched_job->s_fence->finished; - __entry->name = sched_job->sched->name; - __entry->job_count = spsc_queue_count(&entity->job_queue); - __entry->hw_job_count = atomic_read( - &sched_job->sched->hw_rq_count); - ), - TP_printk("entity=%p, id=%llu, fence=%p, ring=%s, job count:%u, hw job count:%d", - __entry->entity, __entry->id, - __entry->fence, __entry->name, - __entry->job_count, __entry->hw_job_count) -); - -TRACE_EVENT(drm_sched_process_job, - TP_PROTO(struct drm_sched_fence *fence), - TP_ARGS(fence), - TP_STRUCT__entry( - __field(struct dma_fence *, fence) - ), - - TP_fast_assign( - __entry->fence = &fence->finished; - ), - TP_printk("fence=%p signaled", __entry->fence) -); - -#endif - -/* This part must be outside protection */ -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH . -#include -- cgit v1.2.3 From 552825b28ddac200b6080d9e79f4121b68e1517d Mon Sep 17 00:00:00 2001 From: Chunming Zhou Date: Mon, 2 Apr 2018 11:20:44 +0800 Subject: drm/amdgpu: add new bo flag that indicates BOs don't need fallback (v2) Use cases: 1. KFD wraps amdgpu_bo_create; it has no fallback case, which is different from amdgpu_gem_object_create. Since the upstream branch has no amdgpu_amdkfd_gpuvm.c, the KFD developers need to add this flag to __alloc_memory_of_gpu: + flags |= AMDGPU_GEM_CREATE_NO_FALLBACK; 2. UMD can specify this flag for its allocations as well if desired.
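[Editor's note: for use case 2, a minimal sketch of how a UMD could request the flag through the GEM create ioctl, assuming the ioctl path accepts the new flag; the size and domain are illustrative, and the field names come from struct drm_amdgpu_gem_create_in below:

	struct drm_amdgpu_gem_create args = {};

	args.in.bo_size = 4096;
	args.in.domains = AMDGPU_GEM_DOMAIN_VRAM;
	args.in.domain_flags = AMDGPU_GEM_CREATE_NO_FALLBACK;]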
v2: squash in merge conflict fix (Chunming) Signed-off-by: Chunming Zhou Acked-by: Felix Kuehling Cc: felix.kuehling@amd.com Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 5 ++++- include/uapi/drm/amdgpu_drm.h | 2 ++ 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index dc34b50e6b29..d7d7ce1507ec 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -386,7 +386,8 @@ retry: bo->tbo.mem.start < adev->gmc.visible_vram_size >> PAGE_SHIFT) p->bytes_moved_vis += ctx.bytes_moved; - if (unlikely(r == -ENOMEM) && domain != bo->allowed_domains) { + if (unlikely(r == -ENOMEM) && domain != bo->allowed_domains && + !(bo->flags & AMDGPU_GEM_CREATE_NO_FALLBACK)) { domain = bo->allowed_domains; goto retry; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 04d6830347ec..9e23d6f6f3f3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -388,6 +388,8 @@ retry: drm_gem_private_object_init(adev->ddev, &bo->gem_base, size); INIT_LIST_HEAD(&bo->shadow_list); INIT_LIST_HEAD(&bo->va); + bo->preferred_domains = preferred_domains; + bo->allowed_domains = allowed_domains; bo->flags = flags; @@ -424,7 +426,8 @@ retry: r = ttm_bo_init_reserved(&adev->mman.bdev, &bo->tbo, size, type, &bo->placement, page_align, &ctx, acc_size, NULL, resv, &amdgpu_ttm_bo_destroy); - if (unlikely(r && r != -ERESTARTSYS) && type == ttm_bo_type_device) { + if (unlikely(r && r != -ERESTARTSYS) && type == ttm_bo_type_device && + !(flags & AMDGPU_GEM_CREATE_NO_FALLBACK)) { if (flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED) { flags &= ~AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED; goto retry; diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index c363b67f2d0a..4f5a27d64c54 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -95,6 +95,8 @@ extern "C" { #define AMDGPU_GEM_CREATE_VM_ALWAYS_VALID (1 << 6) /* Flag that BO sharing will be explicitly synchronized */ #define AMDGPU_GEM_CREATE_EXPLICIT_SYNC (1 << 7) +/* Flag that BO doesn't need fallback */ +#define AMDGPU_GEM_CREATE_NO_FALLBACK (1 << 8) struct drm_amdgpu_gem_create_in { /** the requested memory size */ -- cgit v1.2.3 From 1a61ee07211c543bf43e635fa703c162a78af0e1 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Wed, 4 Apr 2018 15:32:51 -0700 Subject: drm/sched: Extend the documentation. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These comments answer all the questions I had for myself when implementing a driver using the GPU scheduler. Signed-off-by: Eric Anholt Reviewed-by: Christian König Signed-off-by: Alex Deucher --- include/drm/gpu_scheduler.h | 46 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index dfd54fb94e10..c053a32341bf 100644 --- a/include/drm/gpu_scheduler.h +++ b/include/drm/gpu_scheduler.h @@ -43,10 +43,12 @@ enum drm_sched_priority { }; /** - * A scheduler entity is a wrapper around a job queue or a group - * of other entities. Entities take turns emitting jobs from their - * job queues to corresponding hardware ring based on scheduling - * policy. 
+ * drm_sched_entity - A wrapper around a job queue (typically attached + * to the DRM file_priv). + * + * Entities will emit jobs in order to their corresponding hardware + * ring, and the scheduler will alternate between entities based on + * scheduling policy. */ struct drm_sched_entity { struct list_head list; @@ -78,7 +80,18 @@ struct drm_sched_rq { struct drm_sched_fence { struct dma_fence scheduled; + + /* This fence is what will be signaled by the scheduler when + * the job is completed. + * + * When setting up an out fence for the job, you should use + * this, since it's available immediately upon + * drm_sched_job_init(), and the fence returned by the driver + * from run_job() won't be created until the dependencies have + * resolved. + */ struct dma_fence finished; + struct dma_fence_cb cb; struct dma_fence *parent; struct drm_gpu_scheduler *sched; @@ -88,6 +101,13 @@ struct drm_sched_fence { struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f); +/** + * drm_sched_job - A job to be run by an entity. + * + * A job is created by the driver using drm_sched_job_init(), and + * should call drm_sched_entity_push_job() once it wants the scheduler + * to schedule the job. + */ struct drm_sched_job { struct spsc_node queue_node; struct drm_gpu_scheduler *sched; @@ -112,10 +132,28 @@ static inline bool drm_sched_invalidate_job(struct drm_sched_job *s_job, * these functions should be implemented in driver side */ struct drm_sched_backend_ops { + /* Called when the scheduler is considering scheduling this + * job next, to get another struct dma_fence for this job to + * block on. Once it returns NULL, run_job() may be called. + */ struct dma_fence *(*dependency)(struct drm_sched_job *sched_job, struct drm_sched_entity *s_entity); + + /* Called to execute the job once all of the dependencies have + * been resolved. This may be called multiple times, if + * timedout_job() has happened and drm_sched_job_recovery() + * decides to try it again. + */ struct dma_fence *(*run_job)(struct drm_sched_job *sched_job); + + /* Called when a job has taken too long to execute, to trigger + * GPU recovery. + */ void (*timedout_job)(struct drm_sched_job *sched_job); + + /* Called once the job's finished fence has been signaled and + * it's time to clean it up. 
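+ *
+ * Editor's note, an illustrative sketch that is not part of the original
+ * patch: assuming a driver job struct that embeds drm_sched_job as @base,
+ * a minimal free_job() could be
+ *
+ *	static void my_free_job(struct drm_sched_job *sched_job)
+ *	{
+ *		struct my_job *job =
+ *			container_of(sched_job, struct my_job, base);
+ *
+ *		kfree(job);
+ *	}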
+ */ void (*free_job)(struct drm_sched_job *sched_job); }; -- cgit v1.2.3 From 351f683b9823a3d1bffb6e2e3f381601aa0b2671 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Sat, 17 Feb 2018 20:33:13 +0100 Subject: auxdisplay: Replace licenses with SPDX identifiers Cc: Philippe Ombredanne Acked-by: Willy Tarreau Acked-by: Linus Walleij Acked-by: Robin van der Gracht Acked-by: Geert Uytterhoeven Signed-off-by: Miguel Ojeda --- drivers/auxdisplay/arm-charlcd.c | 2 +- drivers/auxdisplay/cfag12864b.c | 16 +--------------- drivers/auxdisplay/cfag12864bfb.c | 16 +--------------- drivers/auxdisplay/charlcd.c | 6 +----- drivers/auxdisplay/hd44780.c | 6 +----- drivers/auxdisplay/ht16k33.c | 10 +--------- drivers/auxdisplay/ks0108.c | 16 +--------------- drivers/auxdisplay/panel.c | 6 +----- include/linux/cfag12864b.h | 16 +--------------- include/linux/ks0108.h | 16 +--------------- samples/auxdisplay/cfag12864b-example.c | 16 +--------------- 11 files changed, 11 insertions(+), 115 deletions(-) (limited to 'include') diff --git a/drivers/auxdisplay/arm-charlcd.c b/drivers/auxdisplay/arm-charlcd.c index dde180995582..296fb30dfa00 100644 --- a/drivers/auxdisplay/arm-charlcd.c +++ b/drivers/auxdisplay/arm-charlcd.c @@ -1,10 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Driver for the on-board character LCD found on some ARM reference boards * This is basically an Hitachi HD44780 LCD with a custom IP block to drive it * http://en.wikipedia.org/wiki/HD44780_Character_LCD * Currently it will just display the text "ARM Linux" and the linux version * - * License terms: GNU General Public License (GPL) version 2 * Author: Linus Walleij */ #include diff --git a/drivers/auxdisplay/cfag12864b.c b/drivers/auxdisplay/cfag12864b.c index 41ce4bd96813..6bd2f65e116a 100644 --- a/drivers/auxdisplay/cfag12864b.c +++ b/drivers/auxdisplay/cfag12864b.c @@ -1,26 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Filename: cfag12864b.c * Version: 0.1.0 * Description: cfag12864b LCD driver - * License: GPLv2 * Depends: ks0108 * * Author: Copyright (C) Miguel Ojeda Sandonis * Date: 2006-10-31 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * */ #include diff --git a/drivers/auxdisplay/cfag12864bfb.c b/drivers/auxdisplay/cfag12864bfb.c index dcbee7e44721..40c8a552a478 100644 --- a/drivers/auxdisplay/cfag12864bfb.c +++ b/drivers/auxdisplay/cfag12864bfb.c @@ -1,26 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Filename: cfag12864bfb.c * Version: 0.1.0 * Description: cfag12864b LCD framebuffer driver - * License: GPLv2 * Depends: cfag12864b * * Author: Copyright (C) Miguel Ojeda Sandonis * Date: 2006-10-31 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * */ #include diff --git a/drivers/auxdisplay/charlcd.c b/drivers/auxdisplay/charlcd.c index e92f3e25f51d..8673fc2b9eb8 100644 --- a/drivers/auxdisplay/charlcd.c +++ b/drivers/auxdisplay/charlcd.c @@ -1,13 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Character LCD driver for Linux * * Copyright (C) 2000-2008, Willy Tarreau * Copyright (C) 2016-2017 Glider bvba - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include diff --git a/drivers/auxdisplay/hd44780.c b/drivers/auxdisplay/hd44780.c index 036eec404289..78d8f1986fec 100644 --- a/drivers/auxdisplay/hd44780.c +++ b/drivers/auxdisplay/hd44780.c @@ -1,13 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * HD44780 Character LCD driver for Linux * * Copyright (C) 2000-2008, Willy Tarreau * Copyright (C) 2016-2017 Glider bvba - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include diff --git a/drivers/auxdisplay/ht16k33.c b/drivers/auxdisplay/ht16k33.c index fbfa5b4cc567..a43276c76fc6 100644 --- a/drivers/auxdisplay/ht16k33.c +++ b/drivers/auxdisplay/ht16k33.c @@ -1,18 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 /* * HT16K33 driver * * Author: Robin van der Gracht * * Copyright: (C) 2016 Protonic Holland. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include diff --git a/drivers/auxdisplay/ks0108.c b/drivers/auxdisplay/ks0108.c index 816de9eaac26..abfe3fa9e6f4 100644 --- a/drivers/auxdisplay/ks0108.c +++ b/drivers/auxdisplay/ks0108.c @@ -1,26 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Filename: ks0108.c * Version: 0.1.0 * Description: ks0108 LCD Controller driver - * License: GPLv2 * Depends: parport * * Author: Copyright (C) Miguel Ojeda Sandonis * Date: 2006-10-31 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/drivers/auxdisplay/panel.c b/drivers/auxdisplay/panel.c index ec5e8800f8ad..3b25a643058c 100644 --- a/drivers/auxdisplay/panel.c +++ b/drivers/auxdisplay/panel.c @@ -1,13 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Front panel driver for Linux * Copyright (C) 2000-2008, Willy Tarreau * Copyright (C) 2016-2017 Glider bvba * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * This code drives an LCD module (/dev/lcd), and a keypad (/dev/keypad) * connected to a parallel printer port. * diff --git a/include/linux/cfag12864b.h b/include/linux/cfag12864b.h index b454dfce60d9..4060004968c8 100644 --- a/include/linux/cfag12864b.h +++ b/include/linux/cfag12864b.h @@ -1,25 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Filename: cfag12864b.h * Version: 0.1.0 * Description: cfag12864b LCD driver header - * License: GPLv2 * * Author: Copyright (C) Miguel Ojeda Sandonis * Date: 2006-10-12 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * */ #ifndef _CFAG12864B_H_ diff --git a/include/linux/ks0108.h b/include/linux/ks0108.h index cb311798e0bc..0738389b42b6 100644 --- a/include/linux/ks0108.h +++ b/include/linux/ks0108.h @@ -1,25 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Filename: ks0108.h * Version: 0.1.0 * Description: ks0108 LCD Controller driver header - * License: GPLv2 * * Author: Copyright (C) Miguel Ojeda Sandonis * Date: 2006-10-31 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * */ #ifndef _KS0108_H_ diff --git a/samples/auxdisplay/cfag12864b-example.c b/samples/auxdisplay/cfag12864b-example.c index e7823ffb1ca0..85571e90191f 100644 --- a/samples/auxdisplay/cfag12864b-example.c +++ b/samples/auxdisplay/cfag12864b-example.c @@ -1,25 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Filename: cfag12864b-example.c * Version: 0.1.0 * Description: cfag12864b LCD userspace example program - * License: GPLv2 * * Author: Copyright (C) Miguel Ojeda Sandonis * Date: 2006-10-31 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * */ /* -- cgit v1.2.3 From 8ba05d770dc46df76900d4e17c312d62c83bed6f Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Wed, 28 Mar 2018 02:18:13 +0000 Subject: ASoC: trace: remove snd_soc_codec snd_soc_codec has been replaced by snd_soc_component, and it is not used in this file. Let's remove it. Signed-off-by: Kuninori Morimoto Signed-off-by: Mark Brown --- include/trace/events/asoc.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/asoc.h b/include/trace/events/asoc.h index ccd1a3bdff46..40c300fe704d 100644 --- a/include/trace/events/asoc.h +++ b/include/trace/events/asoc.h @@ -12,7 +12,6 @@ #define DAPM_ARROW(dir) (((dir) == SND_SOC_DAPM_DIR_OUT) ? "->" : "<-") struct snd_soc_jack; -struct snd_soc_codec; struct snd_soc_card; struct snd_soc_dapm_widget; struct snd_soc_dapm_path; -- cgit v1.2.3 From 483cbae76824e12d4894f20fab2608789532db3f Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Wed, 28 Mar 2018 02:22:07 +0000 Subject: ASoC: wm8350: remove snd_soc_codec codec has been replaced by component. It seems no-one is using it, so let's remove it. Signed-off-by: Kuninori Morimoto Acked-by: Charles Keepax Signed-off-by: Mark Brown --- include/linux/mfd/wm8350/audio.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/mfd/wm8350/audio.h b/include/linux/mfd/wm8350/audio.h index bd581c6fa085..0bc41c4c0429 100644 --- a/include/linux/mfd/wm8350/audio.h +++ b/include/linux/mfd/wm8350/audio.h @@ -617,11 +617,8 @@ struct wm8350_audio_platform_data { u32 codec_current_charge:2; /* codec current @ vmid charge */ }; -struct snd_soc_codec; - struct wm8350_codec { struct platform_device *pdev; - struct snd_soc_codec *codec; struct wm8350_audio_platform_data *platform_data; }; -- cgit v1.2.3 From 343e64a6c48a6c86552db945d842283eee9f528b Mon Sep 17 00:00:00 2001 From: Biju Das Date: Wed, 28 Mar 2018 20:26:11 +0100 Subject: clk: renesas: Add r8a77470 CPG Core Clock Definitions Add all RZ/G1C Clock Pulse Generator Core Clock Outputs, as listed in Table 7.2 ("List of Clocks [RZ/G1C]") of the RZ/G1C Hardware User's Manual.
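[Editor's note: a hypothetical sketch of how the matching SoC clock driver is expected to consume these IDs, following the DEF_FIXED() helper used by the other renesas-cpg-mssr drivers; the parent clock CLK_PLL1 and the dividers are illustrative, not taken from the RZ/G1C manual:

	static const struct cpg_core_clk r8a77470_core_clks[] __initconst = {
		DEF_FIXED("zx", R8A77470_CLK_ZX, CLK_PLL1, 3, 1),
		DEF_FIXED("zs", R8A77470_CLK_ZS, CLK_PLL1, 6, 1),
	};]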
Signed-off-by: Biju Das Reviewed-by: Fabrizio Castro [geert: Use consecutive numbering] Signed-off-by: Geert Uytterhoeven --- include/dt-bindings/clock/r8a77470-cpg-mssr.h | 36 +++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 include/dt-bindings/clock/r8a77470-cpg-mssr.h (limited to 'include') diff --git a/include/dt-bindings/clock/r8a77470-cpg-mssr.h b/include/dt-bindings/clock/r8a77470-cpg-mssr.h new file mode 100644 index 000000000000..34cba49d0f84 --- /dev/null +++ b/include/dt-bindings/clock/r8a77470-cpg-mssr.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (C) 2018 Renesas Electronics Corp. + */ +#ifndef __DT_BINDINGS_CLOCK_R8A77470_CPG_MSSR_H__ +#define __DT_BINDINGS_CLOCK_R8A77470_CPG_MSSR_H__ + +#include + +/* r8a77470 CPG Core Clocks */ +#define R8A77470_CLK_Z2 0 +#define R8A77470_CLK_ZTR 1 +#define R8A77470_CLK_ZTRD2 2 +#define R8A77470_CLK_ZT 3 +#define R8A77470_CLK_ZX 4 +#define R8A77470_CLK_ZS 5 +#define R8A77470_CLK_HP 6 +#define R8A77470_CLK_B 7 +#define R8A77470_CLK_LB 8 +#define R8A77470_CLK_P 9 +#define R8A77470_CLK_CL 10 +#define R8A77470_CLK_CP 11 +#define R8A77470_CLK_M2 12 +#define R8A77470_CLK_ZB3 13 +#define R8A77470_CLK_SDH 14 +#define R8A77470_CLK_SD0 15 +#define R8A77470_CLK_SD1 16 +#define R8A77470_CLK_SD2 17 +#define R8A77470_CLK_MP 18 +#define R8A77470_CLK_QSPI 19 +#define R8A77470_CLK_CPEX 20 +#define R8A77470_CLK_RCAN 21 +#define R8A77470_CLK_R 22 +#define R8A77470_CLK_OSC 23 + +#endif /* __DT_BINDINGS_CLOCK_R8A77470_CPG_MSSR_H__ */ -- cgit v1.2.3 From ed645cccc0ebc0329916b6df039dd792d9105c9d Mon Sep 17 00:00:00 2001 From: Andrey Gusakov Date: Tue, 27 Mar 2018 17:19:49 +0300 Subject: hwmon: MC13783: Add uid and die temperature sensor inputs The uid and die temperature can be read out on the ADIN7 using input mux. Map uid and die temperature sensor to channels 16 and 17. Signed-off-by: Andrey Gusakov Acked-by: Guenter Roeck Signed-off-by: Lee Jones --- drivers/hwmon/mc13783-adc.c | 60 +++++++++++++++++++++++++++++++++++++++++++++ drivers/mfd/mc13xxx-core.c | 15 +++++++++++- include/linux/mfd/mc13xxx.h | 2 ++ 3 files changed, 76 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/hwmon/mc13783-adc.c b/drivers/hwmon/mc13783-adc.c index 960a1db6f269..67860ad2e3d9 100644 --- a/drivers/hwmon/mc13783-adc.c +++ b/drivers/hwmon/mc13783-adc.c @@ -63,6 +63,10 @@ static int mc13783_adc_read(struct device *dev, if (ret) return ret; + /* ADIN7 subchannels */ + if (channel >= 16) + channel = 7; + channel &= 0x7; *val = (sample[channel % 4] >> (channel > 3 ? 
14 : 2)) & 0x3ff; @@ -111,6 +115,57 @@ static ssize_t mc13783_adc_read_gp(struct device *dev, return sprintf(buf, "%u\n", val); } +static ssize_t mc13783_adc_read_uid(struct device *dev, + struct device_attribute *devattr, char *buf) +{ + unsigned int val; + struct platform_device *pdev = to_platform_device(dev); + kernel_ulong_t driver_data = platform_get_device_id(pdev)->driver_data; + int ret = mc13783_adc_read(dev, devattr, &val); + + if (ret) + return ret; + + if (driver_data & MC13783_ADC_BPDIV2) + /* MC13892 have 1/2 divider, input range is [0, 4.800V] */ + val = DIV_ROUND_CLOSEST(val * 4800, 1024); + else + /* MC13783 have 0.9 divider, input range is [0, 2.555V] */ + val = DIV_ROUND_CLOSEST(val * 2555, 1024); + + return sprintf(buf, "%u\n", val); +} + +static ssize_t mc13783_adc_read_temp(struct device *dev, + struct device_attribute *devattr, char *buf) +{ + unsigned int val; + struct platform_device *pdev = to_platform_device(dev); + kernel_ulong_t driver_data = platform_get_device_id(pdev)->driver_data; + int ret = mc13783_adc_read(dev, devattr, &val); + + if (ret) + return ret; + + if (driver_data & MC13783_ADC_BPDIV2) { + /* + * MC13892: + * Die Temperature Read Out Code at 25C 680 + * Temperature change per LSB +0.4244C + */ + ret = DIV_ROUND_CLOSEST(-2635920 + val * 4244, 10); + } else { + /* + * MC13783: + * Die Temperature Read Out Code at 25C 282 + * Temperature change per LSB -1.14C + */ + ret = 346480 - 1140 * val; + } + + return sprintf(buf, "%d\n", ret); +} + static DEVICE_ATTR_RO(name); static SENSOR_DEVICE_ATTR(in2_input, S_IRUGO, mc13783_adc_read_bp, NULL, 2); static SENSOR_DEVICE_ATTR(in5_input, S_IRUGO, mc13783_adc_read_gp, NULL, 5); @@ -124,6 +179,9 @@ static SENSOR_DEVICE_ATTR(in12_input, S_IRUGO, mc13783_adc_read_gp, NULL, 12); static SENSOR_DEVICE_ATTR(in13_input, S_IRUGO, mc13783_adc_read_gp, NULL, 13); static SENSOR_DEVICE_ATTR(in14_input, S_IRUGO, mc13783_adc_read_gp, NULL, 14); static SENSOR_DEVICE_ATTR(in15_input, S_IRUGO, mc13783_adc_read_gp, NULL, 15); +static SENSOR_DEVICE_ATTR(in16_input, S_IRUGO, mc13783_adc_read_uid, NULL, 16); +static SENSOR_DEVICE_ATTR(temp1_input, S_IRUGO, + mc13783_adc_read_temp, NULL, 17); static struct attribute *mc13783_attr_base[] = { &dev_attr_name.attr, @@ -131,6 +189,8 @@ static struct attribute *mc13783_attr_base[] = { &sensor_dev_attr_in5_input.dev_attr.attr, &sensor_dev_attr_in6_input.dev_attr.attr, &sensor_dev_attr_in7_input.dev_attr.attr, + &sensor_dev_attr_in16_input.dev_attr.attr, + &sensor_dev_attr_temp1_input.dev_attr.attr, NULL }; diff --git a/drivers/mfd/mc13xxx-core.c b/drivers/mfd/mc13xxx-core.c index d7f54e492aa6..c63e331738c1 100644 --- a/drivers/mfd/mc13xxx-core.c +++ b/drivers/mfd/mc13xxx-core.c @@ -279,8 +279,21 @@ int mc13xxx_adc_do_conversion(struct mc13xxx *mc13xxx, unsigned int mode, adc0 = MC13XXX_ADC0_ADINC1 | MC13XXX_ADC0_ADINC2; adc1 = MC13XXX_ADC1_ADEN | MC13XXX_ADC1_ADTRIGIGN | MC13XXX_ADC1_ASC; - if (channel > 7) + /* + * Channels mapped through ADIN7: + * 7 - General purpose ADIN7 + * 16 - UID + * 17 - Die temperature + */ + if (channel > 7 && channel < 16) { adc1 |= MC13XXX_ADC1_ADSEL; + } else if (channel == 16) { + adc0 |= MC13XXX_ADC0_ADIN7SEL_UID; + channel = 7; + } else if (channel == 17) { + adc0 |= MC13XXX_ADC0_ADIN7SEL_DIE; + channel = 7; + } switch (mode) { case MC13XXX_ADC_MODE_TS: diff --git a/include/linux/mfd/mc13xxx.h b/include/linux/mfd/mc13xxx.h index 638222e43e48..54a3cd808f9e 100644 --- a/include/linux/mfd/mc13xxx.h +++ b/include/linux/mfd/mc13xxx.h @@ -243,6 +243,8 @@ 
struct mc13xxx_platform_data { #define MC13XXX_ADC0_LICELLCON (1 << 0) #define MC13XXX_ADC0_CHRGICON (1 << 1) #define MC13XXX_ADC0_BATICON (1 << 2) +#define MC13XXX_ADC0_ADIN7SEL_DIE (1 << 4) +#define MC13XXX_ADC0_ADIN7SEL_UID (2 << 4) #define MC13XXX_ADC0_ADREFEN (1 << 10) #define MC13XXX_ADC0_TSMOD0 (1 << 12) #define MC13XXX_ADC0_TSMOD1 (1 << 13) -- cgit v1.2.3 From 057666b69b1d51feb389a17ec73722b001aaf3d0 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 9 Apr 2018 22:21:49 +0200 Subject: ALSA: emu10k1: Reduce GFP_ATOMIC allocation The emu10k1 fx8010 code allocates each irq resource dynamically and links to the list at PCM trigger callback. Due to the nature of trigger callback, the allocation is done with GFP_ATOMIC, hence it may fail more often. Moreover, the irq resource isn't big at all, and using the kmalloc for this won't save many bytes, either. This patch removes the dynamic allocation and embeds the irq resource into struct snd_emu10k1_fx8010_pcm.irq field instead of keeping a pointer. As a result, it simplifies the code and removes the unnecessary GFP_ATOMIC usage. Signed-off-by: Takashi Iwai --- include/sound/emu10k1.h | 4 ++-- sound/pci/emu10k1/emufx.c | 9 +-------- sound/pci/emu10k1/emupcm.c | 2 +- 3 files changed, 4 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/sound/emu10k1.h b/include/sound/emu10k1.h index 5ebcc51c0a6a..8c1572de44c5 100644 --- a/include/sound/emu10k1.h +++ b/include/sound/emu10k1.h @@ -1610,7 +1610,7 @@ struct snd_emu10k1_fx8010_pcm { struct snd_pcm_indirect pcm_rec; unsigned int tram_pos; unsigned int tram_shift; - struct snd_emu10k1_fx8010_irq *irq; + struct snd_emu10k1_fx8010_irq irq; }; struct snd_emu10k1_fx8010 { @@ -1902,7 +1902,7 @@ int snd_emu10k1_fx8010_register_irq_handler(struct snd_emu10k1 *emu, snd_fx8010_irq_handler_t *handler, unsigned char gpr_running, void *private_data, - struct snd_emu10k1_fx8010_irq **r_irq); + struct snd_emu10k1_fx8010_irq *irq); int snd_emu10k1_fx8010_unregister_irq_handler(struct snd_emu10k1 *emu, struct snd_emu10k1_fx8010_irq *irq); diff --git a/sound/pci/emu10k1/emufx.c b/sound/pci/emu10k1/emufx.c index a2b56b188be4..608ff4857d70 100644 --- a/sound/pci/emu10k1/emufx.c +++ b/sound/pci/emu10k1/emufx.c @@ -421,14 +421,10 @@ int snd_emu10k1_fx8010_register_irq_handler(struct snd_emu10k1 *emu, snd_fx8010_irq_handler_t *handler, unsigned char gpr_running, void *private_data, - struct snd_emu10k1_fx8010_irq **r_irq) + struct snd_emu10k1_fx8010_irq *irq) { - struct snd_emu10k1_fx8010_irq *irq; unsigned long flags; - irq = kmalloc(sizeof(*irq), GFP_ATOMIC); - if (irq == NULL) - return -ENOMEM; irq->handler = handler; irq->gpr_running = gpr_running; irq->private_data = private_data; @@ -443,8 +439,6 @@ int snd_emu10k1_fx8010_register_irq_handler(struct snd_emu10k1 *emu, emu->fx8010.irq_handlers = irq; } spin_unlock_irqrestore(&emu->fx8010.irq_lock, flags); - if (r_irq) - *r_irq = irq; return 0; } @@ -468,7 +462,6 @@ int snd_emu10k1_fx8010_unregister_irq_handler(struct snd_emu10k1 *emu, tmp->next = tmp->next->next; } spin_unlock_irqrestore(&emu->fx8010.irq_lock, flags); - kfree(irq); return 0; } diff --git a/sound/pci/emu10k1/emupcm.c b/sound/pci/emu10k1/emupcm.c index cefe613ef7b7..d39458ab251f 100644 --- a/sound/pci/emu10k1/emupcm.c +++ b/sound/pci/emu10k1/emupcm.c @@ -1724,7 +1724,7 @@ static int snd_emu10k1_fx8010_playback_trigger(struct snd_pcm_substream *substre case SNDRV_PCM_TRIGGER_STOP: case SNDRV_PCM_TRIGGER_PAUSE_PUSH: case SNDRV_PCM_TRIGGER_SUSPEND: - 
snd_emu10k1_fx8010_unregister_irq_handler(emu, pcm->irq); pcm->irq = NULL; + snd_emu10k1_fx8010_unregister_irq_handler(emu, &pcm->irq); snd_emu10k1_ptr_write(emu, emu->gpr_base + pcm->gpr_trigger, 0, 0); pcm->tram_pos = INITIAL_TRAM_POS(pcm->buffer_size); pcm->tram_shift = 0; -- cgit v1.2.3 From ec1ba3e519c0f46523cf40b83dc71562171b7c08 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 22 Mar 2018 11:17:40 +0100 Subject: regulator: ab8500: Drop AB8540/9540 support The AB8540 was an evolved version of the AB8500, but it was never mass produced or put into products, only reference designs exist. The upstream support was never completed and it is unlikely that this will happen so drop the support for now to simplify maintenance of the AB8500. Cc: Loic Pallardy Signed-off-by: Linus Walleij Signed-off-by: Mark Brown --- drivers/regulator/ab8500.c | 1779 ++++---------------------------------- include/linux/regulator/ab8500.h | 157 +--- 2 files changed, 188 insertions(+), 1748 deletions(-) (limited to 'include') diff --git a/drivers/regulator/ab8500.c b/drivers/regulator/ab8500.c index 0f97514e3474..83dba3fbfe0c 100644 --- a/drivers/regulator/ab8500.c +++ b/drivers/regulator/ab8500.c @@ -132,33 +132,6 @@ static const unsigned int ldo_vaux56_voltages[] = { 2790000, }; -static const unsigned int ldo_vaux3_ab8540_voltages[] = { - 1200000, - 1500000, - 1800000, - 2100000, - 2500000, - 2750000, - 2790000, - 2910000, - 3050000, -}; - -static const unsigned int ldo_vaux56_ab8540_voltages[] = { - 750000, 760000, 770000, 780000, 790000, 800000, - 810000, 820000, 830000, 840000, 850000, 860000, - 870000, 880000, 890000, 900000, 910000, 920000, - 930000, 940000, 950000, 960000, 970000, 980000, - 990000, 1000000, 1010000, 1020000, 1030000, - 1040000, 1050000, 1060000, 1070000, 1080000, - 1090000, 1100000, 1110000, 1120000, 1130000, - 1140000, 1150000, 1160000, 1170000, 1180000, - 1190000, 1200000, 1210000, 1220000, 1230000, - 1240000, 1250000, 1260000, 1270000, 1280000, - 1290000, 1300000, 1310000, 1320000, 1330000, - 1340000, 1350000, 1360000, 1800000, 2790000, -}; - static const unsigned int ldo_vintcore_voltages[] = { 1200000, 1225000, @@ -232,8 +205,6 @@ static const unsigned int ldo_vdmic_voltages[] = { static DEFINE_MUTEX(shared_mode_mutex); static struct ab8500_shared_mode ldo_anamic1_shared; static struct ab8500_shared_mode ldo_anamic2_shared; -static struct ab8500_shared_mode ab8540_ldo_anamic1_shared; -static struct ab8500_shared_mode ab8540_ldo_anamic2_shared; static int ab8500_regulator_enable(struct regulator_dev *rdev) { @@ -507,53 +478,6 @@ static int ab8500_regulator_get_voltage_sel(struct regulator_dev *rdev) return (regval & info->voltage_mask) >> voltage_shift; } -static int ab8540_aux3_regulator_get_voltage_sel(struct regulator_dev *rdev) -{ - int ret, voltage_shift; - struct ab8500_regulator_info *info = rdev_get_drvdata(rdev); - u8 regval, regval_expand; - - if (info == NULL) { - dev_err(rdev_get_dev(rdev), "regulator info null pointer\n"); - return -EINVAL; - } - - ret = abx500_get_register_interruptible(info->dev, - info->expand_register.voltage_bank, - info->expand_register.voltage_reg, ®val_expand); - if (ret < 0) { - dev_err(rdev_get_dev(rdev), - "couldn't read voltage expand reg for regulator\n"); - return ret; - } - - dev_vdbg(rdev_get_dev(rdev), - "%s-get_voltage expand (bank, reg, mask, value): 0x%x, 0x%x, 0x%x, 0x%x\n", - info->desc.name, info->expand_register.voltage_bank, - info->expand_register.voltage_reg, - info->expand_register.voltage_mask, regval_expand); - - if 
(regval_expand & info->expand_register.voltage_mask) - return info->expand_register.voltage_limit; - - ret = abx500_get_register_interruptible(info->dev, - info->voltage_bank, info->voltage_reg, ®val); - if (ret < 0) { - dev_err(rdev_get_dev(rdev), - "couldn't read voltage reg for regulator\n"); - return ret; - } - - dev_vdbg(rdev_get_dev(rdev), - "%s-get_voltage (bank, reg, mask, value): 0x%x, 0x%x, 0x%x, 0x%x\n", - info->desc.name, info->voltage_bank, info->voltage_reg, - info->voltage_mask, regval); - - voltage_shift = ffs(info->voltage_mask) - 1; - - return (regval & info->voltage_mask) >> voltage_shift; -} - static int ab8500_regulator_set_voltage_sel(struct regulator_dev *rdev, unsigned selector) { @@ -586,61 +510,6 @@ static int ab8500_regulator_set_voltage_sel(struct regulator_dev *rdev, return ret; } -static int ab8540_aux3_regulator_set_voltage_sel(struct regulator_dev *rdev, - unsigned selector) -{ - int ret; - struct ab8500_regulator_info *info = rdev_get_drvdata(rdev); - u8 regval, regval_expand; - - if (info == NULL) { - dev_err(rdev_get_dev(rdev), "regulator info null pointer\n"); - return -EINVAL; - } - - if (selector < info->expand_register.voltage_limit) { - int voltage_shift = ffs(info->voltage_mask) - 1; - - regval = (u8)selector << voltage_shift; - ret = abx500_mask_and_set_register_interruptible(info->dev, - info->voltage_bank, info->voltage_reg, - info->voltage_mask, regval); - if (ret < 0) { - dev_err(rdev_get_dev(rdev), - "couldn't set voltage reg for regulator\n"); - return ret; - } - - dev_vdbg(rdev_get_dev(rdev), - "%s-set_voltage (bank, reg, mask, value): 0x%x, 0x%x, 0x%x, 0x%x\n", - info->desc.name, info->voltage_bank, info->voltage_reg, - info->voltage_mask, regval); - - regval_expand = 0; - } else { - regval_expand = info->expand_register.voltage_mask; - } - - ret = abx500_mask_and_set_register_interruptible(info->dev, - info->expand_register.voltage_bank, - info->expand_register.voltage_reg, - info->expand_register.voltage_mask, - regval_expand); - if (ret < 0) { - dev_err(rdev_get_dev(rdev), - "couldn't set expand voltage reg for regulator\n"); - return ret; - } - - dev_vdbg(rdev_get_dev(rdev), - "%s-set_voltage expand (bank, reg, mask, value): 0x%x, 0x%x, 0x%x, 0x%x\n", - info->desc.name, info->expand_register.voltage_bank, - info->expand_register.voltage_reg, - info->expand_register.voltage_mask, regval_expand); - - return 0; -} - static struct regulator_ops ab8500_regulator_volt_mode_ops = { .enable = ab8500_regulator_enable, .disable = ab8500_regulator_disable, @@ -653,18 +522,6 @@ static struct regulator_ops ab8500_regulator_volt_mode_ops = { .list_voltage = regulator_list_voltage_table, }; -static struct regulator_ops ab8540_aux3_regulator_volt_mode_ops = { - .enable = ab8500_regulator_enable, - .disable = ab8500_regulator_disable, - .get_optimum_mode = ab8500_regulator_get_optimum_mode, - .set_mode = ab8500_regulator_set_mode, - .get_mode = ab8500_regulator_get_mode, - .is_enabled = ab8500_regulator_is_enabled, - .get_voltage_sel = ab8540_aux3_regulator_get_voltage_sel, - .set_voltage_sel = ab8540_aux3_regulator_set_voltage_sel, - .list_voltage = regulator_list_voltage_table, -}; - static struct regulator_ops ab8500_regulator_volt_ops = { .enable = ab8500_regulator_enable, .disable = ab8500_regulator_disable, @@ -1217,1156 +1074,118 @@ static struct ab8500_regulator_info }, }; -/* AB9540 regulator information */ -static struct ab8500_regulator_info - ab9540_regulator_info[AB9540_NUM_REGULATORS] = { +static struct ab8500_shared_mode 
ldo_anamic1_shared = { + .shared_regulator = &ab8505_regulator_info[AB8505_LDO_ANAMIC2], +}; + +static struct ab8500_shared_mode ldo_anamic2_shared = { + .shared_regulator = &ab8505_regulator_info[AB8505_LDO_ANAMIC1], +}; + +struct ab8500_reg_init { + u8 bank; + u8 addr; + u8 mask; +}; + +#define REG_INIT(_id, _bank, _addr, _mask) \ + [_id] = { \ + .bank = _bank, \ + .addr = _addr, \ + .mask = _mask, \ + } + +/* AB8500 register init */ +static struct ab8500_reg_init ab8500_reg_init[] = { /* - * Variable Voltage Regulators - * name, min mV, max mV, - * update bank, reg, mask, enable val - * volt bank, reg, mask + * 0x30, VanaRequestCtrl + * 0xc0, VextSupply1RequestCtrl */ - [AB9540_LDO_AUX1] = { - .desc = { - .name = "LDO-AUX1", - .ops = &ab8500_regulator_volt_mode_ops, - .type = REGULATOR_VOLTAGE, - .id = AB9540_LDO_AUX1, - .owner = THIS_MODULE, - .n_voltages = ARRAY_SIZE(ldo_vauxn_voltages), - .volt_table = ldo_vauxn_voltages, - }, - .load_lp_uA = 5000, - .update_bank = 0x04, - .update_reg = 0x09, - .update_mask = 0x03, - .update_val = 0x01, - .update_val_idle = 0x03, - .update_val_normal = 0x01, - .voltage_bank = 0x04, - .voltage_reg = 0x1f, - .voltage_mask = 0x0f, - }, - [AB9540_LDO_AUX2] = { - .desc = { - .name = "LDO-AUX2", - .ops = &ab8500_regulator_volt_mode_ops, - .type = REGULATOR_VOLTAGE, - .id = AB9540_LDO_AUX2, - .owner = THIS_MODULE, - .n_voltages = ARRAY_SIZE(ldo_vauxn_voltages), - .volt_table = ldo_vauxn_voltages, - }, - .load_lp_uA = 5000, - .update_bank = 0x04, - .update_reg = 0x09, - .update_mask = 0x0c, - .update_val = 0x04, - .update_val_idle = 0x0c, - .update_val_normal = 0x04, - .voltage_bank = 0x04, - .voltage_reg = 0x20, - .voltage_mask = 0x0f, - }, - [AB9540_LDO_AUX3] = { - .desc = { - .name = "LDO-AUX3", - .ops = &ab8500_regulator_volt_mode_ops, - .type = REGULATOR_VOLTAGE, - .id = AB9540_LDO_AUX3, - .owner = THIS_MODULE, - .n_voltages = ARRAY_SIZE(ldo_vaux3_voltages), - .volt_table = ldo_vaux3_voltages, - }, - .load_lp_uA = 5000, - .update_bank = 0x04, - .update_reg = 0x0a, - .update_mask = 0x03, - .update_val = 0x01, - .update_val_idle = 0x03, - .update_val_normal = 0x01, - .voltage_bank = 0x04, - .voltage_reg = 0x21, - .voltage_mask = 0x07, - }, - [AB9540_LDO_AUX4] = { - .desc = { - .name = "LDO-AUX4", - .ops = &ab8500_regulator_volt_mode_ops, - .type = REGULATOR_VOLTAGE, - .id = AB9540_LDO_AUX4, - .owner = THIS_MODULE, - .n_voltages = ARRAY_SIZE(ldo_vauxn_voltages), - .volt_table = ldo_vauxn_voltages, - }, - .load_lp_uA = 5000, - /* values for Vaux4Regu register */ - .update_bank = 0x04, - .update_reg = 0x2e, - .update_mask = 0x03, - .update_val = 0x01, - .update_val_idle = 0x03, - .update_val_normal = 0x01, - /* values for Vaux4SEL register */ - .voltage_bank = 0x04, - .voltage_reg = 0x2f, - .voltage_mask = 0x0f, - }, - [AB9540_LDO_INTCORE] = { - .desc = { - .name = "LDO-INTCORE", - .ops = &ab8500_regulator_volt_mode_ops, - .type = REGULATOR_VOLTAGE, - .id = AB9540_LDO_INTCORE, - .owner = THIS_MODULE, - .n_voltages = ARRAY_SIZE(ldo_vintcore_voltages), - .volt_table = ldo_vintcore_voltages, - }, - .load_lp_uA = 5000, - .update_bank = 0x03, - .update_reg = 0x80, - .update_mask = 0x44, - .update_val = 0x44, - .update_val_idle = 0x44, - .update_val_normal = 0x04, - .voltage_bank = 0x03, - .voltage_reg = 0x80, - .voltage_mask = 0x38, - }, - + REG_INIT(AB8500_REGUREQUESTCTRL2, 0x03, 0x04, 0xf0), /* - * Fixed Voltage Regulators - * name, fixed mV, - * update bank, reg, mask, enable val + * 0x03, VextSupply2RequestCtrl + * 0x0c, VextSupply3RequestCtrl + * 0x30, 
Vaux1RequestCtrl + * 0xc0, Vaux2RequestCtrl */ - [AB9540_LDO_TVOUT] = { - .desc = { - .name = "LDO-TVOUT", - .ops = &ab8500_regulator_mode_ops, - .type = REGULATOR_VOLTAGE, - .id = AB9540_LDO_TVOUT, - .owner = THIS_MODULE, - .n_voltages = 1, - .volt_table = fixed_2000000_voltage, - .enable_time = 10000, - }, - .load_lp_uA = 1000, - .update_bank = 0x03, - .update_reg = 0x80, - .update_mask = 0x82, - .update_val = 0x02, - .update_val_idle = 0x82, - .update_val_normal = 0x02, - }, - [AB9540_LDO_USB] = { - .desc = { - .name = "LDO-USB", - .ops = &ab8500_regulator_ops, - .type = REGULATOR_VOLTAGE, - .id = AB9540_LDO_USB, - .owner = THIS_MODULE, - .n_voltages = 1, - .volt_table = fixed_3300000_voltage, - }, - .update_bank = 0x03, - .update_reg = 0x82, - .update_mask = 0x03, - .update_val = 0x01, - .update_val_idle = 0x03, - .update_val_normal = 0x01, - }, - [AB9540_LDO_AUDIO] = { - .desc = { - .name = "LDO-AUDIO", - .ops = &ab8500_regulator_ops, - .type = REGULATOR_VOLTAGE, - .id = AB9540_LDO_AUDIO, - .owner = THIS_MODULE, - .n_voltages = 1, - .volt_table = fixed_2000000_voltage, - }, - .update_bank = 0x03, - .update_reg = 0x83, - .update_mask = 0x02, - .update_val = 0x02, - }, - [AB9540_LDO_ANAMIC1] = { - .desc = { - .name = "LDO-ANAMIC1", - .ops = &ab8500_regulator_ops, - .type = REGULATOR_VOLTAGE, - .id = AB9540_LDO_ANAMIC1, - .owner = THIS_MODULE, - .n_voltages = 1, - .volt_table = fixed_2050000_voltage, - }, - .update_bank = 0x03, - .update_reg = 0x83, - .update_mask = 0x08, - .update_val = 0x08, - }, - [AB9540_LDO_ANAMIC2] = { - .desc = { - .name = "LDO-ANAMIC2", - .ops = &ab8500_regulator_ops, - .type = REGULATOR_VOLTAGE, - .id = AB9540_LDO_ANAMIC2, - .owner = THIS_MODULE, - .n_voltages = 1, - .volt_table = fixed_2050000_voltage, - }, - .update_bank = 0x03, - .update_reg = 0x83, - .update_mask = 0x10, - .update_val = 0x10, - }, - [AB9540_LDO_DMIC] = { - .desc = { - .name = "LDO-DMIC", - .ops = &ab8500_regulator_ops, - .type = REGULATOR_VOLTAGE, - .id = AB9540_LDO_DMIC, - .owner = THIS_MODULE, - .n_voltages = 1, - .volt_table = fixed_1800000_voltage, - }, - .update_bank = 0x03, - .update_reg = 0x83, - .update_mask = 0x04, - .update_val = 0x04, - }, - - /* - * Regulators with fixed voltage and normal/idle modes - */ - [AB9540_LDO_ANA] = { - .desc = { - .name = "LDO-ANA", - .ops = &ab8500_regulator_mode_ops, - .type = REGULATOR_VOLTAGE, - .id = AB9540_LDO_ANA, - .owner = THIS_MODULE, - .n_voltages = 1, - .volt_table = fixed_1200000_voltage, - }, - .load_lp_uA = 1000, - .update_bank = 0x04, - .update_reg = 0x06, - .update_mask = 0x0c, - .update_val = 0x08, - .update_val_idle = 0x0c, - .update_val_normal = 0x08, - }, -}; - -/* AB8540 regulator information */ -static struct ab8500_regulator_info - ab8540_regulator_info[AB8540_NUM_REGULATORS] = { - /* - * Variable Voltage Regulators - * name, min mV, max mV, - * update bank, reg, mask, enable val - * volt bank, reg, mask - */ - [AB8540_LDO_AUX1] = { - .desc = { - .name = "LDO-AUX1", - .ops = &ab8500_regulator_volt_mode_ops, - .type = REGULATOR_VOLTAGE, - .id = AB8540_LDO_AUX1, - .owner = THIS_MODULE, - .n_voltages = ARRAY_SIZE(ldo_vauxn_voltages), - .volt_table = ldo_vauxn_voltages, - }, - .load_lp_uA = 5000, - .update_bank = 0x04, - .update_reg = 0x09, - .update_mask = 0x03, - .update_val = 0x01, - .update_val_idle = 0x03, - .update_val_normal = 0x01, - .voltage_bank = 0x04, - .voltage_reg = 0x1f, - .voltage_mask = 0x0f, - }, - [AB8540_LDO_AUX2] = { - .desc = { - .name = "LDO-AUX2", - .ops = &ab8500_regulator_volt_mode_ops, - .type = 
REGULATOR_VOLTAGE, - .id = AB8540_LDO_AUX2, - .owner = THIS_MODULE, - .n_voltages = ARRAY_SIZE(ldo_vauxn_voltages), - .volt_table = ldo_vauxn_voltages, - }, - .load_lp_uA = 5000, - .update_bank = 0x04, - .update_reg = 0x09, - .update_mask = 0x0c, - .update_val = 0x04, - .update_val_idle = 0x0c, - .update_val_normal = 0x04, - .voltage_bank = 0x04, - .voltage_reg = 0x20, - .voltage_mask = 0x0f, - }, - [AB8540_LDO_AUX3] = { - .desc = { - .name = "LDO-AUX3", - .ops = &ab8540_aux3_regulator_volt_mode_ops, - .type = REGULATOR_VOLTAGE, - .id = AB8540_LDO_AUX3, - .owner = THIS_MODULE, - .n_voltages = ARRAY_SIZE(ldo_vaux3_ab8540_voltages), - .volt_table = ldo_vaux3_ab8540_voltages, - }, - .load_lp_uA = 5000, - .update_bank = 0x04, - .update_reg = 0x0a, - .update_mask = 0x03, - .update_val = 0x01, - .update_val_idle = 0x03, - .update_val_normal = 0x01, - .voltage_bank = 0x04, - .voltage_reg = 0x21, - .voltage_mask = 0x07, - .expand_register = { - .voltage_limit = 8, - .voltage_bank = 0x04, - .voltage_reg = 0x01, - .voltage_mask = 0x10, - } - }, - [AB8540_LDO_AUX4] = { - .desc = { - .name = "LDO-AUX4", - .ops = &ab8500_regulator_volt_mode_ops, - .type = REGULATOR_VOLTAGE, - .id = AB8540_LDO_AUX4, - .owner = THIS_MODULE, - .n_voltages = ARRAY_SIZE(ldo_vauxn_voltages), - .volt_table = ldo_vauxn_voltages, - }, - .load_lp_uA = 5000, - /* values for Vaux4Regu register */ - .update_bank = 0x04, - .update_reg = 0x2e, - .update_mask = 0x03, - .update_val = 0x01, - .update_val_idle = 0x03, - .update_val_normal = 0x01, - /* values for Vaux4SEL register */ - .voltage_bank = 0x04, - .voltage_reg = 0x2f, - .voltage_mask = 0x0f, - }, - [AB8540_LDO_AUX5] = { - .desc = { - .name = "LDO-AUX5", - .ops = &ab8500_regulator_volt_mode_ops, - .type = REGULATOR_VOLTAGE, - .id = AB8540_LDO_AUX5, - .owner = THIS_MODULE, - .n_voltages = ARRAY_SIZE(ldo_vaux56_ab8540_voltages), - .volt_table = ldo_vaux56_ab8540_voltages, - }, - .load_lp_uA = 20000, - /* values for Vaux5Regu register */ - .update_bank = 0x04, - .update_reg = 0x32, - .update_mask = 0x03, - .update_val = 0x01, - .update_val_idle = 0x03, - .update_val_normal = 0x01, - /* values for Vaux5SEL register */ - .voltage_bank = 0x04, - .voltage_reg = 0x33, - .voltage_mask = 0x3f, - }, - [AB8540_LDO_AUX6] = { - .desc = { - .name = "LDO-AUX6", - .ops = &ab8500_regulator_volt_mode_ops, - .type = REGULATOR_VOLTAGE, - .id = AB8540_LDO_AUX6, - .owner = THIS_MODULE, - .n_voltages = ARRAY_SIZE(ldo_vaux56_ab8540_voltages), - .volt_table = ldo_vaux56_ab8540_voltages, - }, - .load_lp_uA = 20000, - /* values for Vaux6Regu register */ - .update_bank = 0x04, - .update_reg = 0x35, - .update_mask = 0x03, - .update_val = 0x01, - .update_val_idle = 0x03, - .update_val_normal = 0x01, - /* values for Vaux6SEL register */ - .voltage_bank = 0x04, - .voltage_reg = 0x36, - .voltage_mask = 0x3f, - }, - [AB8540_LDO_INTCORE] = { - .desc = { - .name = "LDO-INTCORE", - .ops = &ab8500_regulator_volt_mode_ops, - .type = REGULATOR_VOLTAGE, - .id = AB8540_LDO_INTCORE, - .owner = THIS_MODULE, - .n_voltages = ARRAY_SIZE(ldo_vintcore_voltages), - .volt_table = ldo_vintcore_voltages, - }, - .load_lp_uA = 5000, - .update_bank = 0x03, - .update_reg = 0x80, - .update_mask = 0x44, - .update_val = 0x44, - .update_val_idle = 0x44, - .update_val_normal = 0x04, - .voltage_bank = 0x03, - .voltage_reg = 0x80, - .voltage_mask = 0x38, - }, - - /* - * Fixed Voltage Regulators - * name, fixed mV, - * update bank, reg, mask, enable val - */ - [AB8540_LDO_TVOUT] = { - .desc = { - .name = "LDO-TVOUT", - .ops = 
&ab8500_regulator_mode_ops, - .type = REGULATOR_VOLTAGE, - .id = AB8540_LDO_TVOUT, - .owner = THIS_MODULE, - .n_voltages = 1, - .volt_table = fixed_2000000_voltage, - .enable_time = 10000, - }, - .load_lp_uA = 1000, - .update_bank = 0x03, - .update_reg = 0x80, - .update_mask = 0x82, - .update_val = 0x02, - .update_val_idle = 0x82, - .update_val_normal = 0x02, - }, - [AB8540_LDO_AUDIO] = { - .desc = { - .name = "LDO-AUDIO", - .ops = &ab8500_regulator_ops, - .type = REGULATOR_VOLTAGE, - .id = AB8540_LDO_AUDIO, - .owner = THIS_MODULE, - .n_voltages = 1, - .volt_table = fixed_2000000_voltage, - }, - .update_bank = 0x03, - .update_reg = 0x83, - .update_mask = 0x02, - .update_val = 0x02, - }, - [AB8540_LDO_ANAMIC1] = { - .desc = { - .name = "LDO-ANAMIC1", - .ops = &ab8500_regulator_anamic_mode_ops, - .type = REGULATOR_VOLTAGE, - .id = AB8540_LDO_ANAMIC1, - .owner = THIS_MODULE, - .n_voltages = 1, - .volt_table = fixed_2050000_voltage, - }, - .shared_mode = &ab8540_ldo_anamic1_shared, - .update_bank = 0x03, - .update_reg = 0x83, - .update_mask = 0x08, - .update_val = 0x08, - .mode_bank = 0x03, - .mode_reg = 0x83, - .mode_mask = 0x20, - .mode_val_idle = 0x20, - .mode_val_normal = 0x00, - }, - [AB8540_LDO_ANAMIC2] = { - .desc = { - .name = "LDO-ANAMIC2", - .ops = &ab8500_regulator_anamic_mode_ops, - .type = REGULATOR_VOLTAGE, - .id = AB8540_LDO_ANAMIC2, - .owner = THIS_MODULE, - .n_voltages = 1, - .volt_table = fixed_2050000_voltage, - }, - .shared_mode = &ab8540_ldo_anamic2_shared, - .update_bank = 0x03, - .update_reg = 0x83, - .update_mask = 0x10, - .update_val = 0x10, - .mode_bank = 0x03, - .mode_reg = 0x83, - .mode_mask = 0x20, - .mode_val_idle = 0x20, - .mode_val_normal = 0x00, - }, - [AB8540_LDO_DMIC] = { - .desc = { - .name = "LDO-DMIC", - .ops = &ab8500_regulator_volt_mode_ops, - .type = REGULATOR_VOLTAGE, - .id = AB8540_LDO_DMIC, - .owner = THIS_MODULE, - .n_voltages = ARRAY_SIZE(ldo_vdmic_voltages), - .volt_table = ldo_vdmic_voltages, - }, - .load_lp_uA = 1000, - .update_bank = 0x03, - .update_reg = 0x83, - .update_mask = 0x04, - .update_val = 0x04, - .voltage_bank = 0x03, - .voltage_reg = 0x83, - .voltage_mask = 0xc0, - }, - - /* - * Regulators with fixed voltage and normal/idle modes - */ - [AB8540_LDO_ANA] = { - .desc = { - .name = "LDO-ANA", - .ops = &ab8500_regulator_mode_ops, - .type = REGULATOR_VOLTAGE, - .id = AB8540_LDO_ANA, - .owner = THIS_MODULE, - .n_voltages = 1, - .volt_table = fixed_1200000_voltage, - }, - .load_lp_uA = 1000, - .update_bank = 0x04, - .update_reg = 0x06, - .update_mask = 0x0c, - .update_val = 0x04, - .update_val_idle = 0x0c, - .update_val_normal = 0x04, - }, - [AB8540_LDO_SDIO] = { - .desc = { - .name = "LDO-SDIO", - .ops = &ab8500_regulator_volt_mode_ops, - .type = REGULATOR_VOLTAGE, - .id = AB8540_LDO_SDIO, - .owner = THIS_MODULE, - .n_voltages = ARRAY_SIZE(ldo_sdio_voltages), - .volt_table = ldo_sdio_voltages, - }, - .load_lp_uA = 5000, - .update_bank = 0x03, - .update_reg = 0x88, - .update_mask = 0x30, - .update_val = 0x10, - .update_val_idle = 0x30, - .update_val_normal = 0x10, - .voltage_bank = 0x03, - .voltage_reg = 0x88, - .voltage_mask = 0x07, - }, -}; - -static struct ab8500_shared_mode ldo_anamic1_shared = { - .shared_regulator = &ab8505_regulator_info[AB8505_LDO_ANAMIC2], -}; - -static struct ab8500_shared_mode ldo_anamic2_shared = { - .shared_regulator = &ab8505_regulator_info[AB8505_LDO_ANAMIC1], -}; - -static struct ab8500_shared_mode ab8540_ldo_anamic1_shared = { - .shared_regulator = &ab8540_regulator_info[AB8540_LDO_ANAMIC2], -}; - -static 
struct ab8500_shared_mode ab8540_ldo_anamic2_shared = { - .shared_regulator = &ab8540_regulator_info[AB8540_LDO_ANAMIC1], -}; - -struct ab8500_reg_init { - u8 bank; - u8 addr; - u8 mask; -}; - -#define REG_INIT(_id, _bank, _addr, _mask) \ - [_id] = { \ - .bank = _bank, \ - .addr = _addr, \ - .mask = _mask, \ - } - -/* AB8500 register init */ -static struct ab8500_reg_init ab8500_reg_init[] = { - /* - * 0x30, VanaRequestCtrl - * 0xc0, VextSupply1RequestCtrl - */ - REG_INIT(AB8500_REGUREQUESTCTRL2, 0x03, 0x04, 0xf0), - /* - * 0x03, VextSupply2RequestCtrl - * 0x0c, VextSupply3RequestCtrl - * 0x30, Vaux1RequestCtrl - * 0xc0, Vaux2RequestCtrl - */ - REG_INIT(AB8500_REGUREQUESTCTRL3, 0x03, 0x05, 0xff), - /* - * 0x03, Vaux3RequestCtrl - * 0x04, SwHPReq - */ - REG_INIT(AB8500_REGUREQUESTCTRL4, 0x03, 0x06, 0x07), - /* - * 0x08, VanaSysClkReq1HPValid - * 0x20, Vaux1SysClkReq1HPValid - * 0x40, Vaux2SysClkReq1HPValid - * 0x80, Vaux3SysClkReq1HPValid - */ - REG_INIT(AB8500_REGUSYSCLKREQ1HPVALID1, 0x03, 0x07, 0xe8), - /* - * 0x10, VextSupply1SysClkReq1HPValid - * 0x20, VextSupply2SysClkReq1HPValid - * 0x40, VextSupply3SysClkReq1HPValid - */ - REG_INIT(AB8500_REGUSYSCLKREQ1HPVALID2, 0x03, 0x08, 0x70), - /* - * 0x08, VanaHwHPReq1Valid - * 0x20, Vaux1HwHPReq1Valid - * 0x40, Vaux2HwHPReq1Valid - * 0x80, Vaux3HwHPReq1Valid - */ - REG_INIT(AB8500_REGUHWHPREQ1VALID1, 0x03, 0x09, 0xe8), - /* - * 0x01, VextSupply1HwHPReq1Valid - * 0x02, VextSupply2HwHPReq1Valid - * 0x04, VextSupply3HwHPReq1Valid - */ - REG_INIT(AB8500_REGUHWHPREQ1VALID2, 0x03, 0x0a, 0x07), - /* - * 0x08, VanaHwHPReq2Valid - * 0x20, Vaux1HwHPReq2Valid - * 0x40, Vaux2HwHPReq2Valid - * 0x80, Vaux3HwHPReq2Valid - */ - REG_INIT(AB8500_REGUHWHPREQ2VALID1, 0x03, 0x0b, 0xe8), - /* - * 0x01, VextSupply1HwHPReq2Valid - * 0x02, VextSupply2HwHPReq2Valid - * 0x04, VextSupply3HwHPReq2Valid - */ - REG_INIT(AB8500_REGUHWHPREQ2VALID2, 0x03, 0x0c, 0x07), - /* - * 0x20, VanaSwHPReqValid - * 0x80, Vaux1SwHPReqValid - */ - REG_INIT(AB8500_REGUSWHPREQVALID1, 0x03, 0x0d, 0xa0), - /* - * 0x01, Vaux2SwHPReqValid - * 0x02, Vaux3SwHPReqValid - * 0x04, VextSupply1SwHPReqValid - * 0x08, VextSupply2SwHPReqValid - * 0x10, VextSupply3SwHPReqValid - */ - REG_INIT(AB8500_REGUSWHPREQVALID2, 0x03, 0x0e, 0x1f), - /* - * 0x02, SysClkReq2Valid1 - * 0x04, SysClkReq3Valid1 - * 0x08, SysClkReq4Valid1 - * 0x10, SysClkReq5Valid1 - * 0x20, SysClkReq6Valid1 - * 0x40, SysClkReq7Valid1 - * 0x80, SysClkReq8Valid1 - */ - REG_INIT(AB8500_REGUSYSCLKREQVALID1, 0x03, 0x0f, 0xfe), - /* - * 0x02, SysClkReq2Valid2 - * 0x04, SysClkReq3Valid2 - * 0x08, SysClkReq4Valid2 - * 0x10, SysClkReq5Valid2 - * 0x20, SysClkReq6Valid2 - * 0x40, SysClkReq7Valid2 - * 0x80, SysClkReq8Valid2 - */ - REG_INIT(AB8500_REGUSYSCLKREQVALID2, 0x03, 0x10, 0xfe), - /* - * 0x02, VTVoutEna - * 0x04, Vintcore12Ena - * 0x38, Vintcore12Sel - * 0x40, Vintcore12LP - * 0x80, VTVoutLP - */ - REG_INIT(AB8500_REGUMISC1, 0x03, 0x80, 0xfe), - /* - * 0x02, VaudioEna - * 0x04, VdmicEna - * 0x08, Vamic1Ena - * 0x10, Vamic2Ena - */ - REG_INIT(AB8500_VAUDIOSUPPLY, 0x03, 0x83, 0x1e), - /* - * 0x01, Vamic1_dzout - * 0x02, Vamic2_dzout - */ - REG_INIT(AB8500_REGUCTRL1VAMIC, 0x03, 0x84, 0x03), - /* - * 0x03, VpllRegu (NOTE! 
PRCMU register bits) - * 0x0c, VanaRegu - */ - REG_INIT(AB8500_VPLLVANAREGU, 0x04, 0x06, 0x0f), - /* - * 0x01, VrefDDREna - * 0x02, VrefDDRSleepMode - */ - REG_INIT(AB8500_VREFDDR, 0x04, 0x07, 0x03), - /* - * 0x03, VextSupply1Regu - * 0x0c, VextSupply2Regu - * 0x30, VextSupply3Regu - * 0x40, ExtSupply2Bypass - * 0x80, ExtSupply3Bypass - */ - REG_INIT(AB8500_EXTSUPPLYREGU, 0x04, 0x08, 0xff), - /* - * 0x03, Vaux1Regu - * 0x0c, Vaux2Regu - */ - REG_INIT(AB8500_VAUX12REGU, 0x04, 0x09, 0x0f), - /* - * 0x03, Vaux3Regu - */ - REG_INIT(AB8500_VRF1VAUX3REGU, 0x04, 0x0a, 0x03), - /* - * 0x0f, Vaux1Sel - */ - REG_INIT(AB8500_VAUX1SEL, 0x04, 0x1f, 0x0f), - /* - * 0x0f, Vaux2Sel - */ - REG_INIT(AB8500_VAUX2SEL, 0x04, 0x20, 0x0f), - /* - * 0x07, Vaux3Sel - */ - REG_INIT(AB8500_VRF1VAUX3SEL, 0x04, 0x21, 0x07), - /* - * 0x01, VextSupply12LP - */ - REG_INIT(AB8500_REGUCTRL2SPARE, 0x04, 0x22, 0x01), - /* - * 0x04, Vaux1Disch - * 0x08, Vaux2Disch - * 0x10, Vaux3Disch - * 0x20, Vintcore12Disch - * 0x40, VTVoutDisch - * 0x80, VaudioDisch - */ - REG_INIT(AB8500_REGUCTRLDISCH, 0x04, 0x43, 0xfc), - /* - * 0x02, VanaDisch - * 0x04, VdmicPullDownEna - * 0x10, VdmicDisch - */ - REG_INIT(AB8500_REGUCTRLDISCH2, 0x04, 0x44, 0x16), -}; - -/* AB8505 register init */ -static struct ab8500_reg_init ab8505_reg_init[] = { - /* - * 0x03, VarmRequestCtrl - * 0x0c, VsmpsCRequestCtrl - * 0x30, VsmpsARequestCtrl - * 0xc0, VsmpsBRequestCtrl - */ - REG_INIT(AB8505_REGUREQUESTCTRL1, 0x03, 0x03, 0xff), - /* - * 0x03, VsafeRequestCtrl - * 0x0c, VpllRequestCtrl - * 0x30, VanaRequestCtrl - */ - REG_INIT(AB8505_REGUREQUESTCTRL2, 0x03, 0x04, 0x3f), - /* - * 0x30, Vaux1RequestCtrl - * 0xc0, Vaux2RequestCtrl - */ - REG_INIT(AB8505_REGUREQUESTCTRL3, 0x03, 0x05, 0xf0), - /* - * 0x03, Vaux3RequestCtrl - * 0x04, SwHPReq - */ - REG_INIT(AB8505_REGUREQUESTCTRL4, 0x03, 0x06, 0x07), - /* - * 0x01, VsmpsASysClkReq1HPValid - * 0x02, VsmpsBSysClkReq1HPValid - * 0x04, VsafeSysClkReq1HPValid - * 0x08, VanaSysClkReq1HPValid - * 0x10, VpllSysClkReq1HPValid - * 0x20, Vaux1SysClkReq1HPValid - * 0x40, Vaux2SysClkReq1HPValid - * 0x80, Vaux3SysClkReq1HPValid - */ - REG_INIT(AB8505_REGUSYSCLKREQ1HPVALID1, 0x03, 0x07, 0xff), - /* - * 0x01, VsmpsCSysClkReq1HPValid - * 0x02, VarmSysClkReq1HPValid - * 0x04, VbbSysClkReq1HPValid - * 0x08, VsmpsMSysClkReq1HPValid - */ - REG_INIT(AB8505_REGUSYSCLKREQ1HPVALID2, 0x03, 0x08, 0x0f), - /* - * 0x01, VsmpsAHwHPReq1Valid - * 0x02, VsmpsBHwHPReq1Valid - * 0x04, VsafeHwHPReq1Valid - * 0x08, VanaHwHPReq1Valid - * 0x10, VpllHwHPReq1Valid - * 0x20, Vaux1HwHPReq1Valid - * 0x40, Vaux2HwHPReq1Valid - * 0x80, Vaux3HwHPReq1Valid - */ - REG_INIT(AB8505_REGUHWHPREQ1VALID1, 0x03, 0x09, 0xff), - /* - * 0x08, VsmpsMHwHPReq1Valid - */ - REG_INIT(AB8505_REGUHWHPREQ1VALID2, 0x03, 0x0a, 0x08), - /* - * 0x01, VsmpsAHwHPReq2Valid - * 0x02, VsmpsBHwHPReq2Valid - * 0x04, VsafeHwHPReq2Valid - * 0x08, VanaHwHPReq2Valid - * 0x10, VpllHwHPReq2Valid - * 0x20, Vaux1HwHPReq2Valid - * 0x40, Vaux2HwHPReq2Valid - * 0x80, Vaux3HwHPReq2Valid - */ - REG_INIT(AB8505_REGUHWHPREQ2VALID1, 0x03, 0x0b, 0xff), - /* - * 0x08, VsmpsMHwHPReq2Valid - */ - REG_INIT(AB8505_REGUHWHPREQ2VALID2, 0x03, 0x0c, 0x08), - /* - * 0x01, VsmpsCSwHPReqValid - * 0x02, VarmSwHPReqValid - * 0x04, VsmpsASwHPReqValid - * 0x08, VsmpsBSwHPReqValid - * 0x10, VsafeSwHPReqValid - * 0x20, VanaSwHPReqValid - * 0x40, VpllSwHPReqValid - * 0x80, Vaux1SwHPReqValid - */ - REG_INIT(AB8505_REGUSWHPREQVALID1, 0x03, 0x0d, 0xff), - /* - * 0x01, Vaux2SwHPReqValid - * 0x02, Vaux3SwHPReqValid - * 0x20, 
VsmpsMSwHPReqValid - */ - REG_INIT(AB8505_REGUSWHPREQVALID2, 0x03, 0x0e, 0x23), - /* - * 0x02, SysClkReq2Valid1 - * 0x04, SysClkReq3Valid1 - * 0x08, SysClkReq4Valid1 - */ - REG_INIT(AB8505_REGUSYSCLKREQVALID1, 0x03, 0x0f, 0x0e), - /* - * 0x02, SysClkReq2Valid2 - * 0x04, SysClkReq3Valid2 - * 0x08, SysClkReq4Valid2 - */ - REG_INIT(AB8505_REGUSYSCLKREQVALID2, 0x03, 0x10, 0x0e), - /* - * 0x01, Vaux4SwHPReqValid - * 0x02, Vaux4HwHPReq2Valid - * 0x04, Vaux4HwHPReq1Valid - * 0x08, Vaux4SysClkReq1HPValid - */ - REG_INIT(AB8505_REGUVAUX4REQVALID, 0x03, 0x11, 0x0f), - /* - * 0x02, VadcEna - * 0x04, VintCore12Ena - * 0x38, VintCore12Sel - * 0x40, VintCore12LP - * 0x80, VadcLP - */ - REG_INIT(AB8505_REGUMISC1, 0x03, 0x80, 0xfe), - /* - * 0x02, VaudioEna - * 0x04, VdmicEna - * 0x08, Vamic1Ena - * 0x10, Vamic2Ena - */ - REG_INIT(AB8505_VAUDIOSUPPLY, 0x03, 0x83, 0x1e), - /* - * 0x01, Vamic1_dzout - * 0x02, Vamic2_dzout - */ - REG_INIT(AB8505_REGUCTRL1VAMIC, 0x03, 0x84, 0x03), - /* - * 0x03, VsmpsARegu - * 0x0c, VsmpsASelCtrl - * 0x10, VsmpsAAutoMode - * 0x20, VsmpsAPWMMode - */ - REG_INIT(AB8505_VSMPSAREGU, 0x04, 0x03, 0x3f), - /* - * 0x03, VsmpsBRegu - * 0x0c, VsmpsBSelCtrl - * 0x10, VsmpsBAutoMode - * 0x20, VsmpsBPWMMode - */ - REG_INIT(AB8505_VSMPSBREGU, 0x04, 0x04, 0x3f), - /* - * 0x03, VsafeRegu - * 0x0c, VsafeSelCtrl - * 0x10, VsafeAutoMode - * 0x20, VsafePWMMode - */ - REG_INIT(AB8505_VSAFEREGU, 0x04, 0x05, 0x3f), - /* - * 0x03, VpllRegu (NOTE! PRCMU register bits) - * 0x0c, VanaRegu - */ - REG_INIT(AB8505_VPLLVANAREGU, 0x04, 0x06, 0x0f), - /* - * 0x03, VextSupply1Regu - * 0x0c, VextSupply2Regu - * 0x30, VextSupply3Regu - * 0x40, ExtSupply2Bypass - * 0x80, ExtSupply3Bypass - */ - REG_INIT(AB8505_EXTSUPPLYREGU, 0x04, 0x08, 0xff), - /* - * 0x03, Vaux1Regu - * 0x0c, Vaux2Regu - */ - REG_INIT(AB8505_VAUX12REGU, 0x04, 0x09, 0x0f), - /* - * 0x0f, Vaux3Regu - */ - REG_INIT(AB8505_VRF1VAUX3REGU, 0x04, 0x0a, 0x0f), - /* - * 0x3f, VsmpsASel1 - */ - REG_INIT(AB8505_VSMPSASEL1, 0x04, 0x13, 0x3f), - /* - * 0x3f, VsmpsASel2 - */ - REG_INIT(AB8505_VSMPSASEL2, 0x04, 0x14, 0x3f), - /* - * 0x3f, VsmpsASel3 - */ - REG_INIT(AB8505_VSMPSASEL3, 0x04, 0x15, 0x3f), - /* - * 0x3f, VsmpsBSel1 - */ - REG_INIT(AB8505_VSMPSBSEL1, 0x04, 0x17, 0x3f), - /* - * 0x3f, VsmpsBSel2 - */ - REG_INIT(AB8505_VSMPSBSEL2, 0x04, 0x18, 0x3f), - /* - * 0x3f, VsmpsBSel3 - */ - REG_INIT(AB8505_VSMPSBSEL3, 0x04, 0x19, 0x3f), - /* - * 0x7f, VsafeSel1 - */ - REG_INIT(AB8505_VSAFESEL1, 0x04, 0x1b, 0x7f), - /* - * 0x3f, VsafeSel2 - */ - REG_INIT(AB8505_VSAFESEL2, 0x04, 0x1c, 0x7f), - /* - * 0x3f, VsafeSel3 - */ - REG_INIT(AB8505_VSAFESEL3, 0x04, 0x1d, 0x7f), - /* - * 0x0f, Vaux1Sel - */ - REG_INIT(AB8505_VAUX1SEL, 0x04, 0x1f, 0x0f), - /* - * 0x0f, Vaux2Sel - */ - REG_INIT(AB8505_VAUX2SEL, 0x04, 0x20, 0x0f), - /* - * 0x07, Vaux3Sel - * 0x30, VRF1Sel - */ - REG_INIT(AB8505_VRF1VAUX3SEL, 0x04, 0x21, 0x37), - /* - * 0x03, Vaux4RequestCtrl - */ - REG_INIT(AB8505_VAUX4REQCTRL, 0x04, 0x2d, 0x03), - /* - * 0x03, Vaux4Regu - */ - REG_INIT(AB8505_VAUX4REGU, 0x04, 0x2e, 0x03), - /* - * 0x0f, Vaux4Sel - */ - REG_INIT(AB8505_VAUX4SEL, 0x04, 0x2f, 0x0f), - /* - * 0x04, Vaux1Disch - * 0x08, Vaux2Disch - * 0x10, Vaux3Disch - * 0x20, Vintcore12Disch - * 0x40, VTVoutDisch - * 0x80, VaudioDisch - */ - REG_INIT(AB8505_REGUCTRLDISCH, 0x04, 0x43, 0xfc), - /* - * 0x02, VanaDisch - * 0x04, VdmicPullDownEna - * 0x10, VdmicDisch - */ - REG_INIT(AB8505_REGUCTRLDISCH2, 0x04, 0x44, 0x16), - /* - * 0x01, Vaux4Disch - */ - REG_INIT(AB8505_REGUCTRLDISCH3, 0x04, 0x48, 0x01), - /* - 
* 0x07, Vaux5Sel - * 0x08, Vaux5LP - * 0x10, Vaux5Ena - * 0x20, Vaux5Disch - * 0x40, Vaux5DisSfst - * 0x80, Vaux5DisPulld - */ - REG_INIT(AB8505_CTRLVAUX5, 0x01, 0x55, 0xff), - /* - * 0x07, Vaux6Sel - * 0x08, Vaux6LP - * 0x10, Vaux6Ena - * 0x80, Vaux6DisPulld - */ - REG_INIT(AB8505_CTRLVAUX6, 0x01, 0x56, 0x9f), -}; - -/* AB9540 register init */ -static struct ab8500_reg_init ab9540_reg_init[] = { - /* - * 0x03, VarmRequestCtrl - * 0x0c, VapeRequestCtrl - * 0x30, Vsmps1RequestCtrl - * 0xc0, Vsmps2RequestCtrl - */ - REG_INIT(AB9540_REGUREQUESTCTRL1, 0x03, 0x03, 0xff), - /* - * 0x03, Vsmps3RequestCtrl - * 0x0c, VpllRequestCtrl - * 0x30, VanaRequestCtrl - * 0xc0, VextSupply1RequestCtrl - */ - REG_INIT(AB9540_REGUREQUESTCTRL2, 0x03, 0x04, 0xff), - /* - * 0x03, VextSupply2RequestCtrl - * 0x0c, VextSupply3RequestCtrl - * 0x30, Vaux1RequestCtrl - * 0xc0, Vaux2RequestCtrl - */ - REG_INIT(AB9540_REGUREQUESTCTRL3, 0x03, 0x05, 0xff), + REG_INIT(AB8500_REGUREQUESTCTRL3, 0x03, 0x05, 0xff), /* * 0x03, Vaux3RequestCtrl * 0x04, SwHPReq */ - REG_INIT(AB9540_REGUREQUESTCTRL4, 0x03, 0x06, 0x07), + REG_INIT(AB8500_REGUREQUESTCTRL4, 0x03, 0x06, 0x07), /* - * 0x01, Vsmps1SysClkReq1HPValid - * 0x02, Vsmps2SysClkReq1HPValid - * 0x04, Vsmps3SysClkReq1HPValid * 0x08, VanaSysClkReq1HPValid - * 0x10, VpllSysClkReq1HPValid * 0x20, Vaux1SysClkReq1HPValid * 0x40, Vaux2SysClkReq1HPValid * 0x80, Vaux3SysClkReq1HPValid */ - REG_INIT(AB9540_REGUSYSCLKREQ1HPVALID1, 0x03, 0x07, 0xff), + REG_INIT(AB8500_REGUSYSCLKREQ1HPVALID1, 0x03, 0x07, 0xe8), /* - * 0x01, VapeSysClkReq1HPValid - * 0x02, VarmSysClkReq1HPValid - * 0x04, VbbSysClkReq1HPValid - * 0x08, VmodSysClkReq1HPValid * 0x10, VextSupply1SysClkReq1HPValid * 0x20, VextSupply2SysClkReq1HPValid * 0x40, VextSupply3SysClkReq1HPValid */ - REG_INIT(AB9540_REGUSYSCLKREQ1HPVALID2, 0x03, 0x08, 0x7f), + REG_INIT(AB8500_REGUSYSCLKREQ1HPVALID2, 0x03, 0x08, 0x70), /* - * 0x01, Vsmps1HwHPReq1Valid - * 0x02, Vsmps2HwHPReq1Valid - * 0x04, Vsmps3HwHPReq1Valid * 0x08, VanaHwHPReq1Valid - * 0x10, VpllHwHPReq1Valid * 0x20, Vaux1HwHPReq1Valid * 0x40, Vaux2HwHPReq1Valid * 0x80, Vaux3HwHPReq1Valid */ - REG_INIT(AB9540_REGUHWHPREQ1VALID1, 0x03, 0x09, 0xff), + REG_INIT(AB8500_REGUHWHPREQ1VALID1, 0x03, 0x09, 0xe8), /* * 0x01, VextSupply1HwHPReq1Valid * 0x02, VextSupply2HwHPReq1Valid * 0x04, VextSupply3HwHPReq1Valid - * 0x08, VmodHwHPReq1Valid */ - REG_INIT(AB9540_REGUHWHPREQ1VALID2, 0x03, 0x0a, 0x0f), + REG_INIT(AB8500_REGUHWHPREQ1VALID2, 0x03, 0x0a, 0x07), /* - * 0x01, Vsmps1HwHPReq2Valid - * 0x02, Vsmps2HwHPReq2Valid - * 0x03, Vsmps3HwHPReq2Valid * 0x08, VanaHwHPReq2Valid - * 0x10, VpllHwHPReq2Valid * 0x20, Vaux1HwHPReq2Valid * 0x40, Vaux2HwHPReq2Valid * 0x80, Vaux3HwHPReq2Valid */ - REG_INIT(AB9540_REGUHWHPREQ2VALID1, 0x03, 0x0b, 0xff), + REG_INIT(AB8500_REGUHWHPREQ2VALID1, 0x03, 0x0b, 0xe8), /* * 0x01, VextSupply1HwHPReq2Valid * 0x02, VextSupply2HwHPReq2Valid * 0x04, VextSupply3HwHPReq2Valid - * 0x08, VmodHwHPReq2Valid */ - REG_INIT(AB9540_REGUHWHPREQ2VALID2, 0x03, 0x0c, 0x0f), + REG_INIT(AB8500_REGUHWHPREQ2VALID2, 0x03, 0x0c, 0x07), /* - * 0x01, VapeSwHPReqValid - * 0x02, VarmSwHPReqValid - * 0x04, Vsmps1SwHPReqValid - * 0x08, Vsmps2SwHPReqValid - * 0x10, Vsmps3SwHPReqValid * 0x20, VanaSwHPReqValid - * 0x40, VpllSwHPReqValid * 0x80, Vaux1SwHPReqValid */ - REG_INIT(AB9540_REGUSWHPREQVALID1, 0x03, 0x0d, 0xff), + REG_INIT(AB8500_REGUSWHPREQVALID1, 0x03, 0x0d, 0xa0), /* * 0x01, Vaux2SwHPReqValid * 0x02, Vaux3SwHPReqValid * 0x04, VextSupply1SwHPReqValid * 0x08, VextSupply2SwHPReqValid * 0x10, 
VextSupply3SwHPReqValid - * 0x20, VmodSwHPReqValid */ - REG_INIT(AB9540_REGUSWHPREQVALID2, 0x03, 0x0e, 0x3f), + REG_INIT(AB8500_REGUSWHPREQVALID2, 0x03, 0x0e, 0x1f), /* * 0x02, SysClkReq2Valid1 - * ... + * 0x04, SysClkReq3Valid1 + * 0x08, SysClkReq4Valid1 + * 0x10, SysClkReq5Valid1 + * 0x20, SysClkReq6Valid1 + * 0x40, SysClkReq7Valid1 * 0x80, SysClkReq8Valid1 */ - REG_INIT(AB9540_REGUSYSCLKREQVALID1, 0x03, 0x0f, 0xfe), + REG_INIT(AB8500_REGUSYSCLKREQVALID1, 0x03, 0x0f, 0xfe), /* * 0x02, SysClkReq2Valid2 - * ... + * 0x04, SysClkReq3Valid2 + * 0x08, SysClkReq4Valid2 + * 0x10, SysClkReq5Valid2 + * 0x20, SysClkReq6Valid2 + * 0x40, SysClkReq7Valid2 * 0x80, SysClkReq8Valid2 */ - REG_INIT(AB9540_REGUSYSCLKREQVALID2, 0x03, 0x10, 0xfe), - /* - * 0x01, Vaux4SwHPReqValid - * 0x02, Vaux4HwHPReq2Valid - * 0x04, Vaux4HwHPReq1Valid - * 0x08, Vaux4SysClkReq1HPValid - */ - REG_INIT(AB9540_REGUVAUX4REQVALID, 0x03, 0x11, 0x0f), + REG_INIT(AB8500_REGUSYSCLKREQVALID2, 0x03, 0x10, 0xfe), /* * 0x02, VTVoutEna * 0x04, Vintcore12Ena @@ -2374,44 +1193,29 @@ static struct ab8500_reg_init ab9540_reg_init[] = { * 0x40, Vintcore12LP * 0x80, VTVoutLP */ - REG_INIT(AB9540_REGUMISC1, 0x03, 0x80, 0xfe), + REG_INIT(AB8500_REGUMISC1, 0x03, 0x80, 0xfe), /* * 0x02, VaudioEna * 0x04, VdmicEna * 0x08, Vamic1Ena * 0x10, Vamic2Ena */ - REG_INIT(AB9540_VAUDIOSUPPLY, 0x03, 0x83, 0x1e), + REG_INIT(AB8500_VAUDIOSUPPLY, 0x03, 0x83, 0x1e), /* * 0x01, Vamic1_dzout * 0x02, Vamic2_dzout */ - REG_INIT(AB9540_REGUCTRL1VAMIC, 0x03, 0x84, 0x03), - /* - * 0x03, Vsmps1Regu - * 0x0c, Vsmps1SelCtrl - * 0x10, Vsmps1AutoMode - * 0x20, Vsmps1PWMMode - */ - REG_INIT(AB9540_VSMPS1REGU, 0x04, 0x03, 0x3f), - /* - * 0x03, Vsmps2Regu - * 0x0c, Vsmps2SelCtrl - * 0x10, Vsmps2AutoMode - * 0x20, Vsmps2PWMMode - */ - REG_INIT(AB9540_VSMPS2REGU, 0x04, 0x04, 0x3f), + REG_INIT(AB8500_REGUCTRL1VAMIC, 0x03, 0x84, 0x03), /* - * 0x03, Vsmps3Regu - * 0x0c, Vsmps3SelCtrl - * NOTE! PRCMU register + * 0x03, VpllRegu (NOTE! PRCMU register bits) + * 0x0c, VanaRegu */ - REG_INIT(AB9540_VSMPS3REGU, 0x04, 0x05, 0x0f), + REG_INIT(AB8500_VPLLVANAREGU, 0x04, 0x06, 0x0f), /* - * 0x03, VpllRegu - * 0x0c, VanaRegu + * 0x01, VrefDDREna + * 0x02, VrefDDRSleepMode */ - REG_INIT(AB9540_VPLLVANAREGU, 0x04, 0x06, 0x0f), + REG_INIT(AB8500_VREFDDR, 0x04, 0x07, 0x03), /* * 0x03, VextSupply1Regu * 0x0c, VextSupply2Regu @@ -2419,83 +1223,33 @@ static struct ab8500_reg_init ab9540_reg_init[] = { * 0x40, ExtSupply2Bypass * 0x80, ExtSupply3Bypass */ - REG_INIT(AB9540_EXTSUPPLYREGU, 0x04, 0x08, 0xff), + REG_INIT(AB8500_EXTSUPPLYREGU, 0x04, 0x08, 0xff), /* * 0x03, Vaux1Regu * 0x0c, Vaux2Regu */ - REG_INIT(AB9540_VAUX12REGU, 0x04, 0x09, 0x0f), + REG_INIT(AB8500_VAUX12REGU, 0x04, 0x09, 0x0f), /* - * 0x0c, Vrf1Regu * 0x03, Vaux3Regu */ - REG_INIT(AB9540_VRF1VAUX3REGU, 0x04, 0x0a, 0x0f), - /* - * 0x3f, Vsmps1Sel1 - */ - REG_INIT(AB9540_VSMPS1SEL1, 0x04, 0x13, 0x3f), - /* - * 0x3f, Vsmps1Sel2 - */ - REG_INIT(AB9540_VSMPS1SEL2, 0x04, 0x14, 0x3f), - /* - * 0x3f, Vsmps1Sel3 - */ - REG_INIT(AB9540_VSMPS1SEL3, 0x04, 0x15, 0x3f), - /* - * 0x3f, Vsmps2Sel1 - */ - REG_INIT(AB9540_VSMPS2SEL1, 0x04, 0x17, 0x3f), - /* - * 0x3f, Vsmps2Sel2 - */ - REG_INIT(AB9540_VSMPS2SEL2, 0x04, 0x18, 0x3f), - /* - * 0x3f, Vsmps2Sel3 - */ - REG_INIT(AB9540_VSMPS2SEL3, 0x04, 0x19, 0x3f), - /* - * 0x7f, Vsmps3Sel1 - * NOTE! PRCMU register - */ - REG_INIT(AB9540_VSMPS3SEL1, 0x04, 0x1b, 0x7f), - /* - * 0x7f, Vsmps3Sel2 - * NOTE! 
PRCMU register - */ - REG_INIT(AB9540_VSMPS3SEL2, 0x04, 0x1c, 0x7f), + REG_INIT(AB8500_VRF1VAUX3REGU, 0x04, 0x0a, 0x03), /* * 0x0f, Vaux1Sel */ - REG_INIT(AB9540_VAUX1SEL, 0x04, 0x1f, 0x0f), + REG_INIT(AB8500_VAUX1SEL, 0x04, 0x1f, 0x0f), /* * 0x0f, Vaux2Sel */ - REG_INIT(AB9540_VAUX2SEL, 0x04, 0x20, 0x0f), + REG_INIT(AB8500_VAUX2SEL, 0x04, 0x20, 0x0f), /* * 0x07, Vaux3Sel - * 0x30, Vrf1Sel */ - REG_INIT(AB9540_VRF1VAUX3SEL, 0x04, 0x21, 0x37), + REG_INIT(AB8500_VRF1VAUX3SEL, 0x04, 0x21, 0x07), /* * 0x01, VextSupply12LP */ - REG_INIT(AB9540_REGUCTRL2SPARE, 0x04, 0x22, 0x01), - /* - * 0x03, Vaux4RequestCtrl - */ - REG_INIT(AB9540_VAUX4REQCTRL, 0x04, 0x2d, 0x03), - /* - * 0x03, Vaux4Regu - */ - REG_INIT(AB9540_VAUX4REGU, 0x04, 0x2e, 0x03), - /* - * 0x08, Vaux4Sel - */ - REG_INIT(AB9540_VAUX4SEL, 0x04, 0x2f, 0x0f), + REG_INIT(AB8500_REGUCTRL2SPARE, 0x04, 0x22, 0x01), /* - * 0x01, VpllDisch - * 0x02, Vrf1Disch * 0x04, Vaux1Disch * 0x08, Vaux2Disch * 0x10, Vaux3Disch @@ -2503,243 +1257,170 @@ static struct ab8500_reg_init ab9540_reg_init[] = { * 0x40, VTVoutDisch * 0x80, VaudioDisch */ - REG_INIT(AB9540_REGUCTRLDISCH, 0x04, 0x43, 0xff), + REG_INIT(AB8500_REGUCTRLDISCH, 0x04, 0x43, 0xfc), /* - * 0x01, VsimDisch * 0x02, VanaDisch * 0x04, VdmicPullDownEna - * 0x08, VpllPullDownEna * 0x10, VdmicDisch */ - REG_INIT(AB9540_REGUCTRLDISCH2, 0x04, 0x44, 0x1f), - /* - * 0x01, Vaux4Disch - */ - REG_INIT(AB9540_REGUCTRLDISCH3, 0x04, 0x48, 0x01), + REG_INIT(AB8500_REGUCTRLDISCH2, 0x04, 0x44, 0x16), }; -/* AB8540 register init */ -static struct ab8500_reg_init ab8540_reg_init[] = { - /* - * 0x01, VSimSycClkReq1Valid - * 0x02, VSimSycClkReq2Valid - * 0x04, VSimSycClkReq3Valid - * 0x08, VSimSycClkReq4Valid - * 0x10, VSimSycClkReq5Valid - * 0x20, VSimSycClkReq6Valid - * 0x40, VSimSycClkReq7Valid - * 0x80, VSimSycClkReq8Valid - */ - REG_INIT(AB8540_VSIMSYSCLKCTRL, 0x02, 0x33, 0xff), +/* AB8505 register init */ +static struct ab8500_reg_init ab8505_reg_init[] = { /* * 0x03, VarmRequestCtrl - * 0x0c, VapeRequestCtrl - * 0x30, Vsmps1RequestCtrl - * 0xc0, Vsmps2RequestCtrl + * 0x0c, VsmpsCRequestCtrl + * 0x30, VsmpsARequestCtrl + * 0xc0, VsmpsBRequestCtrl */ - REG_INIT(AB8540_REGUREQUESTCTRL1, 0x03, 0x03, 0xff), + REG_INIT(AB8505_REGUREQUESTCTRL1, 0x03, 0x03, 0xff), /* - * 0x03, Vsmps3RequestCtrl + * 0x03, VsafeRequestCtrl * 0x0c, VpllRequestCtrl * 0x30, VanaRequestCtrl - * 0xc0, VextSupply1RequestCtrl */ - REG_INIT(AB8540_REGUREQUESTCTRL2, 0x03, 0x04, 0xff), + REG_INIT(AB8505_REGUREQUESTCTRL2, 0x03, 0x04, 0x3f), /* - * 0x03, VextSupply2RequestCtrl - * 0x0c, VextSupply3RequestCtrl * 0x30, Vaux1RequestCtrl * 0xc0, Vaux2RequestCtrl */ - REG_INIT(AB8540_REGUREQUESTCTRL3, 0x03, 0x05, 0xff), + REG_INIT(AB8505_REGUREQUESTCTRL3, 0x03, 0x05, 0xf0), /* * 0x03, Vaux3RequestCtrl * 0x04, SwHPReq */ - REG_INIT(AB8540_REGUREQUESTCTRL4, 0x03, 0x06, 0x07), + REG_INIT(AB8505_REGUREQUESTCTRL4, 0x03, 0x06, 0x07), /* - * 0x01, Vsmps1SysClkReq1HPValid - * 0x02, Vsmps2SysClkReq1HPValid - * 0x04, Vsmps3SysClkReq1HPValid + * 0x01, VsmpsASysClkReq1HPValid + * 0x02, VsmpsBSysClkReq1HPValid + * 0x04, VsafeSysClkReq1HPValid * 0x08, VanaSysClkReq1HPValid * 0x10, VpllSysClkReq1HPValid * 0x20, Vaux1SysClkReq1HPValid * 0x40, Vaux2SysClkReq1HPValid * 0x80, Vaux3SysClkReq1HPValid */ - REG_INIT(AB8540_REGUSYSCLKREQ1HPVALID1, 0x03, 0x07, 0xff), + REG_INIT(AB8505_REGUSYSCLKREQ1HPVALID1, 0x03, 0x07, 0xff), /* - * 0x01, VapeSysClkReq1HPValid + * 0x01, VsmpsCSysClkReq1HPValid * 0x02, VarmSysClkReq1HPValid * 0x04, VbbSysClkReq1HPValid - * 0x10, 
VextSupply1SysClkReq1HPValid - * 0x20, VextSupply2SysClkReq1HPValid - * 0x40, VextSupply3SysClkReq1HPValid + * 0x08, VsmpsMSysClkReq1HPValid */ - REG_INIT(AB8540_REGUSYSCLKREQ1HPVALID2, 0x03, 0x08, 0x77), + REG_INIT(AB8505_REGUSYSCLKREQ1HPVALID2, 0x03, 0x08, 0x0f), /* - * 0x01, Vsmps1HwHPReq1Valid - * 0x02, Vsmps2HwHPReq1Valid - * 0x04, Vsmps3HwHPReq1Valid + * 0x01, VsmpsAHwHPReq1Valid + * 0x02, VsmpsBHwHPReq1Valid + * 0x04, VsafeHwHPReq1Valid * 0x08, VanaHwHPReq1Valid * 0x10, VpllHwHPReq1Valid * 0x20, Vaux1HwHPReq1Valid * 0x40, Vaux2HwHPReq1Valid * 0x80, Vaux3HwHPReq1Valid */ - REG_INIT(AB8540_REGUHWHPREQ1VALID1, 0x03, 0x09, 0xff), + REG_INIT(AB8505_REGUHWHPREQ1VALID1, 0x03, 0x09, 0xff), /* - * 0x01, VextSupply1HwHPReq1Valid - * 0x02, VextSupply2HwHPReq1Valid - * 0x04, VextSupply3HwHPReq1Valid + * 0x08, VsmpsMHwHPReq1Valid */ - REG_INIT(AB8540_REGUHWHPREQ1VALID2, 0x03, 0x0a, 0x07), + REG_INIT(AB8505_REGUHWHPREQ1VALID2, 0x03, 0x0a, 0x08), /* - * 0x01, Vsmps1HwHPReq2Valid - * 0x02, Vsmps2HwHPReq2Valid - * 0x03, Vsmps3HwHPReq2Valid + * 0x01, VsmpsAHwHPReq2Valid + * 0x02, VsmpsBHwHPReq2Valid + * 0x04, VsafeHwHPReq2Valid * 0x08, VanaHwHPReq2Valid * 0x10, VpllHwHPReq2Valid * 0x20, Vaux1HwHPReq2Valid * 0x40, Vaux2HwHPReq2Valid * 0x80, Vaux3HwHPReq2Valid */ - REG_INIT(AB8540_REGUHWHPREQ2VALID1, 0x03, 0x0b, 0xff), + REG_INIT(AB8505_REGUHWHPREQ2VALID1, 0x03, 0x0b, 0xff), /* - * 0x01, VextSupply1HwHPReq2Valid - * 0x02, VextSupply2HwHPReq2Valid - * 0x04, VextSupply3HwHPReq2Valid + * 0x08, VsmpsMHwHPReq2Valid */ - REG_INIT(AB8540_REGUHWHPREQ2VALID2, 0x03, 0x0c, 0x07), + REG_INIT(AB8505_REGUHWHPREQ2VALID2, 0x03, 0x0c, 0x08), /* - * 0x01, VapeSwHPReqValid + * 0x01, VsmpsCSwHPReqValid * 0x02, VarmSwHPReqValid - * 0x04, Vsmps1SwHPReqValid - * 0x08, Vsmps2SwHPReqValid - * 0x10, Vsmps3SwHPReqValid + * 0x04, VsmpsASwHPReqValid + * 0x08, VsmpsBSwHPReqValid + * 0x10, VsafeSwHPReqValid * 0x20, VanaSwHPReqValid * 0x40, VpllSwHPReqValid * 0x80, Vaux1SwHPReqValid */ - REG_INIT(AB8540_REGUSWHPREQVALID1, 0x03, 0x0d, 0xff), + REG_INIT(AB8505_REGUSWHPREQVALID1, 0x03, 0x0d, 0xff), /* * 0x01, Vaux2SwHPReqValid * 0x02, Vaux3SwHPReqValid - * 0x04, VextSupply1SwHPReqValid - * 0x08, VextSupply2SwHPReqValid - * 0x10, VextSupply3SwHPReqValid + * 0x20, VsmpsMSwHPReqValid */ - REG_INIT(AB8540_REGUSWHPREQVALID2, 0x03, 0x0e, 0x1f), + REG_INIT(AB8505_REGUSWHPREQVALID2, 0x03, 0x0e, 0x23), /* * 0x02, SysClkReq2Valid1 - * ... - * 0x80, SysClkReq8Valid1 + * 0x04, SysClkReq3Valid1 + * 0x08, SysClkReq4Valid1 */ - REG_INIT(AB8540_REGUSYSCLKREQVALID1, 0x03, 0x0f, 0xff), + REG_INIT(AB8505_REGUSYSCLKREQVALID1, 0x03, 0x0f, 0x0e), /* * 0x02, SysClkReq2Valid2 - * ... 
- * 0x80, SysClkReq8Valid2 + * 0x04, SysClkReq3Valid2 + * 0x08, SysClkReq4Valid2 */ - REG_INIT(AB8540_REGUSYSCLKREQVALID2, 0x03, 0x10, 0xff), + REG_INIT(AB8505_REGUSYSCLKREQVALID2, 0x03, 0x10, 0x0e), /* * 0x01, Vaux4SwHPReqValid * 0x02, Vaux4HwHPReq2Valid * 0x04, Vaux4HwHPReq1Valid * 0x08, Vaux4SysClkReq1HPValid */ - REG_INIT(AB8540_REGUVAUX4REQVALID, 0x03, 0x11, 0x0f), - /* - * 0x01, Vaux5SwHPReqValid - * 0x02, Vaux5HwHPReq2Valid - * 0x04, Vaux5HwHPReq1Valid - * 0x08, Vaux5SysClkReq1HPValid - */ - REG_INIT(AB8540_REGUVAUX5REQVALID, 0x03, 0x12, 0x0f), - /* - * 0x01, Vaux6SwHPReqValid - * 0x02, Vaux6HwHPReq2Valid - * 0x04, Vaux6HwHPReq1Valid - * 0x08, Vaux6SysClkReq1HPValid - */ - REG_INIT(AB8540_REGUVAUX6REQVALID, 0x03, 0x13, 0x0f), - /* - * 0x01, VclkbSwHPReqValid - * 0x02, VclkbHwHPReq2Valid - * 0x04, VclkbHwHPReq1Valid - * 0x08, VclkbSysClkReq1HPValid - */ - REG_INIT(AB8540_REGUVCLKBREQVALID, 0x03, 0x14, 0x0f), - /* - * 0x01, Vrf1SwHPReqValid - * 0x02, Vrf1HwHPReq2Valid - * 0x04, Vrf1HwHPReq1Valid - * 0x08, Vrf1SysClkReq1HPValid - */ - REG_INIT(AB8540_REGUVRF1REQVALID, 0x03, 0x15, 0x0f), + REG_INIT(AB8505_REGUVAUX4REQVALID, 0x03, 0x11, 0x0f), /* - * 0x02, VTVoutEna - * 0x04, Vintcore12Ena - * 0x38, Vintcore12Sel - * 0x40, Vintcore12LP - * 0x80, VTVoutLP + * 0x02, VadcEna + * 0x04, VintCore12Ena + * 0x38, VintCore12Sel + * 0x40, VintCore12LP + * 0x80, VadcLP */ - REG_INIT(AB8540_REGUMISC1, 0x03, 0x80, 0xfe), + REG_INIT(AB8505_REGUMISC1, 0x03, 0x80, 0xfe), /* * 0x02, VaudioEna * 0x04, VdmicEna * 0x08, Vamic1Ena * 0x10, Vamic2Ena - * 0x20, Vamic12LP - * 0xC0, VdmicSel */ - REG_INIT(AB8540_VAUDIOSUPPLY, 0x03, 0x83, 0xfe), + REG_INIT(AB8505_VAUDIOSUPPLY, 0x03, 0x83, 0x1e), /* * 0x01, Vamic1_dzout * 0x02, Vamic2_dzout */ - REG_INIT(AB8540_REGUCTRL1VAMIC, 0x03, 0x84, 0x03), - /* - * 0x07, VHSICSel - * 0x08, VHSICOffState - * 0x10, VHSIEna - * 0x20, VHSICLP - */ - REG_INIT(AB8540_VHSIC, 0x03, 0x87, 0x3f), - /* - * 0x07, VSDIOSel - * 0x08, VSDIOOffState - * 0x10, VSDIOEna - * 0x20, VSDIOLP - */ - REG_INIT(AB8540_VSDIO, 0x03, 0x88, 0x3f), + REG_INIT(AB8505_REGUCTRL1VAMIC, 0x03, 0x84, 0x03), /* - * 0x03, Vsmps1Regu - * 0x0c, Vsmps1SelCtrl - * 0x10, Vsmps1AutoMode - * 0x20, Vsmps1PWMMode + * 0x03, VsmpsARegu + * 0x0c, VsmpsASelCtrl + * 0x10, VsmpsAAutoMode + * 0x20, VsmpsAPWMMode */ - REG_INIT(AB8540_VSMPS1REGU, 0x04, 0x03, 0x3f), + REG_INIT(AB8505_VSMPSAREGU, 0x04, 0x03, 0x3f), /* - * 0x03, Vsmps2Regu - * 0x0c, Vsmps2SelCtrl - * 0x10, Vsmps2AutoMode - * 0x20, Vsmps2PWMMode + * 0x03, VsmpsBRegu + * 0x0c, VsmpsBSelCtrl + * 0x10, VsmpsBAutoMode + * 0x20, VsmpsBPWMMode */ - REG_INIT(AB8540_VSMPS2REGU, 0x04, 0x04, 0x3f), + REG_INIT(AB8505_VSMPSBREGU, 0x04, 0x04, 0x3f), /* - * 0x03, Vsmps3Regu - * 0x0c, Vsmps3SelCtrl - * 0x10, Vsmps3AutoMode - * 0x20, Vsmps3PWMMode - * NOTE! PRCMU register + * 0x03, VsafeRegu + * 0x0c, VsafeSelCtrl + * 0x10, VsafeAutoMode + * 0x20, VsafePWMMode */ - REG_INIT(AB8540_VSMPS3REGU, 0x04, 0x05, 0x0f), + REG_INIT(AB8505_VSAFEREGU, 0x04, 0x05, 0x3f), /* - * 0x03, VpllRegu + * 0x03, VpllRegu (NOTE! 
PRCMU register bits) * 0x0c, VanaRegu */ - REG_INIT(AB8540_VPLLVANAREGU, 0x04, 0x06, 0x0f), + REG_INIT(AB8505_VPLLVANAREGU, 0x04, 0x06, 0x0f), /* * 0x03, VextSupply1Regu * 0x0c, VextSupply2Regu @@ -2747,128 +1428,78 @@ static struct ab8500_reg_init ab8540_reg_init[] = { * 0x40, ExtSupply2Bypass * 0x80, ExtSupply3Bypass */ - REG_INIT(AB8540_EXTSUPPLYREGU, 0x04, 0x08, 0xff), + REG_INIT(AB8505_EXTSUPPLYREGU, 0x04, 0x08, 0xff), /* * 0x03, Vaux1Regu * 0x0c, Vaux2Regu */ - REG_INIT(AB8540_VAUX12REGU, 0x04, 0x09, 0x0f), + REG_INIT(AB8505_VAUX12REGU, 0x04, 0x09, 0x0f), /* - * 0x0c, VRF1Regu - * 0x03, Vaux3Regu + * 0x0f, Vaux3Regu + */ + REG_INIT(AB8505_VRF1VAUX3REGU, 0x04, 0x0a, 0x0f), + /* + * 0x3f, VsmpsASel1 */ - REG_INIT(AB8540_VRF1VAUX3REGU, 0x04, 0x0a, 0x0f), + REG_INIT(AB8505_VSMPSASEL1, 0x04, 0x13, 0x3f), /* - * 0x3f, Vsmps1Sel1 + * 0x3f, VsmpsASel2 */ - REG_INIT(AB8540_VSMPS1SEL1, 0x04, 0x13, 0x3f), + REG_INIT(AB8505_VSMPSASEL2, 0x04, 0x14, 0x3f), /* - * 0x3f, Vsmps1Sel2 + * 0x3f, VsmpsASel3 */ - REG_INIT(AB8540_VSMPS1SEL2, 0x04, 0x14, 0x3f), + REG_INIT(AB8505_VSMPSASEL3, 0x04, 0x15, 0x3f), /* - * 0x3f, Vsmps1Sel3 + * 0x3f, VsmpsBSel1 */ - REG_INIT(AB8540_VSMPS1SEL3, 0x04, 0x15, 0x3f), + REG_INIT(AB8505_VSMPSBSEL1, 0x04, 0x17, 0x3f), /* - * 0x3f, Vsmps2Sel1 + * 0x3f, VsmpsBSel2 */ - REG_INIT(AB8540_VSMPS2SEL1, 0x04, 0x17, 0x3f), + REG_INIT(AB8505_VSMPSBSEL2, 0x04, 0x18, 0x3f), /* - * 0x3f, Vsmps2Sel2 + * 0x3f, VsmpsBSel3 */ - REG_INIT(AB8540_VSMPS2SEL2, 0x04, 0x18, 0x3f), + REG_INIT(AB8505_VSMPSBSEL3, 0x04, 0x19, 0x3f), /* - * 0x3f, Vsmps2Sel3 + * 0x7f, VsafeSel1 */ - REG_INIT(AB8540_VSMPS2SEL3, 0x04, 0x19, 0x3f), + REG_INIT(AB8505_VSAFESEL1, 0x04, 0x1b, 0x7f), /* - * 0x7f, Vsmps3Sel1 - * NOTE! PRCMU register + * 0x3f, VsafeSel2 */ - REG_INIT(AB8540_VSMPS3SEL1, 0x04, 0x1b, 0x7f), + REG_INIT(AB8505_VSAFESEL2, 0x04, 0x1c, 0x7f), /* - * 0x7f, Vsmps3Sel2 - * NOTE! 
PRCMU register + * 0x3f, VsafeSel3 */ - REG_INIT(AB8540_VSMPS3SEL2, 0x04, 0x1c, 0x7f), + REG_INIT(AB8505_VSAFESEL3, 0x04, 0x1d, 0x7f), /* * 0x0f, Vaux1Sel */ - REG_INIT(AB8540_VAUX1SEL, 0x04, 0x1f, 0x0f), + REG_INIT(AB8505_VAUX1SEL, 0x04, 0x1f, 0x0f), /* * 0x0f, Vaux2Sel */ - REG_INIT(AB8540_VAUX2SEL, 0x04, 0x20, 0x0f), + REG_INIT(AB8505_VAUX2SEL, 0x04, 0x20, 0x0f), /* * 0x07, Vaux3Sel - * 0x70, Vrf1Sel - */ - REG_INIT(AB8540_VRF1VAUX3SEL, 0x04, 0x21, 0x77), - /* - * 0x01, VextSupply12LP - */ - REG_INIT(AB8540_REGUCTRL2SPARE, 0x04, 0x22, 0x01), - /* - * 0x07, Vanasel - * 0x30, Vpllsel + * 0x30, VRF1Sel */ - REG_INIT(AB8540_VANAVPLLSEL, 0x04, 0x29, 0x37), + REG_INIT(AB8505_VRF1VAUX3SEL, 0x04, 0x21, 0x37), /* * 0x03, Vaux4RequestCtrl */ - REG_INIT(AB8540_VAUX4REQCTRL, 0x04, 0x2d, 0x03), + REG_INIT(AB8505_VAUX4REQCTRL, 0x04, 0x2d, 0x03), /* * 0x03, Vaux4Regu */ - REG_INIT(AB8540_VAUX4REGU, 0x04, 0x2e, 0x03), + REG_INIT(AB8505_VAUX4REGU, 0x04, 0x2e, 0x03), /* * 0x0f, Vaux4Sel */ - REG_INIT(AB8540_VAUX4SEL, 0x04, 0x2f, 0x0f), - /* - * 0x03, Vaux5RequestCtrl - */ - REG_INIT(AB8540_VAUX5REQCTRL, 0x04, 0x31, 0x03), - /* - * 0x03, Vaux5Regu - */ - REG_INIT(AB8540_VAUX5REGU, 0x04, 0x32, 0x03), - /* - * 0x3f, Vaux5Sel - */ - REG_INIT(AB8540_VAUX5SEL, 0x04, 0x33, 0x3f), - /* - * 0x03, Vaux6RequestCtrl - */ - REG_INIT(AB8540_VAUX6REQCTRL, 0x04, 0x34, 0x03), - /* - * 0x03, Vaux6Regu - */ - REG_INIT(AB8540_VAUX6REGU, 0x04, 0x35, 0x03), - /* - * 0x3f, Vaux6Sel - */ - REG_INIT(AB8540_VAUX6SEL, 0x04, 0x36, 0x3f), - /* - * 0x03, VCLKBRequestCtrl - */ - REG_INIT(AB8540_VCLKBREQCTRL, 0x04, 0x37, 0x03), - /* - * 0x03, VCLKBRegu - */ - REG_INIT(AB8540_VCLKBREGU, 0x04, 0x38, 0x03), - /* - * 0x07, VCLKBSel - */ - REG_INIT(AB8540_VCLKBSEL, 0x04, 0x39, 0x07), - /* - * 0x03, Vrf1RequestCtrl - */ - REG_INIT(AB8540_VRF1REQCTRL, 0x04, 0x3a, 0x03), + REG_INIT(AB8505_VAUX4SEL, 0x04, 0x2f, 0x0f), /* - * 0x01, VpllDisch - * 0x02, Vrf1Disch * 0x04, Vaux1Disch * 0x08, Vaux2Disch * 0x10, Vaux3Disch @@ -2876,24 +1507,33 @@ static struct ab8500_reg_init ab8540_reg_init[] = { * 0x40, VTVoutDisch * 0x80, VaudioDisch */ - REG_INIT(AB8540_REGUCTRLDISCH, 0x04, 0x43, 0xff), + REG_INIT(AB8505_REGUCTRLDISCH, 0x04, 0x43, 0xfc), /* * 0x02, VanaDisch * 0x04, VdmicPullDownEna - * 0x08, VpllPullDownEna * 0x10, VdmicDisch */ - REG_INIT(AB8540_REGUCTRLDISCH2, 0x04, 0x44, 0x1e), + REG_INIT(AB8505_REGUCTRLDISCH2, 0x04, 0x44, 0x16), /* * 0x01, Vaux4Disch */ - REG_INIT(AB8540_REGUCTRLDISCH3, 0x04, 0x48, 0x01), + REG_INIT(AB8505_REGUCTRLDISCH3, 0x04, 0x48, 0x01), + /* + * 0x07, Vaux5Sel + * 0x08, Vaux5LP + * 0x10, Vaux5Ena + * 0x20, Vaux5Disch + * 0x40, Vaux5DisSfst + * 0x80, Vaux5DisPulld + */ + REG_INIT(AB8505_CTRLVAUX5, 0x01, 0x55, 0xff), /* - * 0x01, Vaux5Disch - * 0x02, Vaux6Disch - * 0x04, VCLKBDisch + * 0x07, Vaux6Sel + * 0x08, Vaux6LP + * 0x10, Vaux6Ena + * 0x80, Vaux6DisPulld */ - REG_INIT(AB8540_REGUCTRLDISCH4, 0x04, 0x49, 0x07), + REG_INIT(AB8505_CTRLVAUX6, 0x01, 0x56, 0x9f), }; static struct of_regulator_match ab8500_regulator_match[] = { @@ -2925,37 +1565,6 @@ static struct of_regulator_match ab8505_regulator_match[] = { { .name = "ab8500_ldo_ana", .driver_data = (void *) AB8505_LDO_ANA, }, }; -static struct of_regulator_match ab8540_regulator_match[] = { - { .name = "ab8500_ldo_aux1", .driver_data = (void *) AB8540_LDO_AUX1, }, - { .name = "ab8500_ldo_aux2", .driver_data = (void *) AB8540_LDO_AUX2, }, - { .name = "ab8500_ldo_aux3", .driver_data = (void *) AB8540_LDO_AUX3, }, - { .name = "ab8500_ldo_aux4", .driver_data = (void *) 
AB8540_LDO_AUX4, }, - { .name = "ab8500_ldo_aux5", .driver_data = (void *) AB8540_LDO_AUX5, }, - { .name = "ab8500_ldo_aux6", .driver_data = (void *) AB8540_LDO_AUX6, }, - { .name = "ab8500_ldo_intcore", .driver_data = (void *) AB8540_LDO_INTCORE, }, - { .name = "ab8500_ldo_tvout", .driver_data = (void *) AB8540_LDO_TVOUT, }, - { .name = "ab8500_ldo_audio", .driver_data = (void *) AB8540_LDO_AUDIO, }, - { .name = "ab8500_ldo_anamic1", .driver_data = (void *) AB8540_LDO_ANAMIC1, }, - { .name = "ab8500_ldo_anamic2", .driver_data = (void *) AB8540_LDO_ANAMIC2, }, - { .name = "ab8500_ldo_dmic", .driver_data = (void *) AB8540_LDO_DMIC, }, - { .name = "ab8500_ldo_ana", .driver_data = (void *) AB8540_LDO_ANA, }, - { .name = "ab8500_ldo_sdio", .driver_data = (void *) AB8540_LDO_SDIO, }, -}; - -static struct of_regulator_match ab9540_regulator_match[] = { - { .name = "ab8500_ldo_aux1", .driver_data = (void *) AB9540_LDO_AUX1, }, - { .name = "ab8500_ldo_aux2", .driver_data = (void *) AB9540_LDO_AUX2, }, - { .name = "ab8500_ldo_aux3", .driver_data = (void *) AB9540_LDO_AUX3, }, - { .name = "ab8500_ldo_aux4", .driver_data = (void *) AB9540_LDO_AUX4, }, - { .name = "ab8500_ldo_intcore", .driver_data = (void *) AB9540_LDO_INTCORE, }, - { .name = "ab8500_ldo_tvout", .driver_data = (void *) AB9540_LDO_TVOUT, }, - { .name = "ab8500_ldo_audio", .driver_data = (void *) AB9540_LDO_AUDIO, }, - { .name = "ab8500_ldo_anamic1", .driver_data = (void *) AB9540_LDO_ANAMIC1, }, - { .name = "ab8500_ldo_anamic2", .driver_data = (void *) AB9540_LDO_ANAMIC2, }, - { .name = "ab8500_ldo_dmic", .driver_data = (void *) AB9540_LDO_DMIC, }, - { .name = "ab8500_ldo_ana", .driver_data = (void *) AB9540_LDO_ANA, }, -}; - static struct { struct ab8500_regulator_info *info; int info_size; @@ -2967,27 +1576,13 @@ static struct { static void abx500_get_regulator_info(struct ab8500 *ab8500) { - if (is_ab9540(ab8500)) { - abx500_regulator.info = ab9540_regulator_info; - abx500_regulator.info_size = ARRAY_SIZE(ab9540_regulator_info); - abx500_regulator.init = ab9540_reg_init; - abx500_regulator.init_size = AB9540_NUM_REGULATOR_REGISTERS; - abx500_regulator.match = ab9540_regulator_match; - abx500_regulator.match_size = ARRAY_SIZE(ab9540_regulator_match); - } else if (is_ab8505(ab8500)) { + if (is_ab8505(ab8500)) { abx500_regulator.info = ab8505_regulator_info; abx500_regulator.info_size = ARRAY_SIZE(ab8505_regulator_info); abx500_regulator.init = ab8505_reg_init; abx500_regulator.init_size = AB8505_NUM_REGULATOR_REGISTERS; abx500_regulator.match = ab8505_regulator_match; abx500_regulator.match_size = ARRAY_SIZE(ab8505_regulator_match); - } else if (is_ab8540(ab8500)) { - abx500_regulator.info = ab8540_regulator_info; - abx500_regulator.info_size = ARRAY_SIZE(ab8540_regulator_info); - abx500_regulator.init = ab8540_reg_init; - abx500_regulator.init_size = AB8540_NUM_REGULATOR_REGISTERS; - abx500_regulator.match = ab8540_regulator_match; - abx500_regulator.match_size = ARRAY_SIZE(ab8540_regulator_match); } else { abx500_regulator.info = ab8500_regulator_info; abx500_regulator.info_size = ARRAY_SIZE(ab8500_regulator_info); diff --git a/include/linux/regulator/ab8500.h b/include/linux/regulator/ab8500.h index d8ecefaf63ca..6d46f962685d 100644 --- a/include/linux/regulator/ab8500.h +++ b/include/linux/regulator/ab8500.h @@ -49,47 +49,7 @@ enum ab8505_regulator_id { AB8505_NUM_REGULATORS, }; -/* AB9540 regulators */ -enum ab9540_regulator_id { - AB9540_LDO_AUX1, - AB9540_LDO_AUX2, - AB9540_LDO_AUX3, - AB9540_LDO_AUX4, - AB9540_LDO_INTCORE, - 
AB9540_LDO_TVOUT, - AB9540_LDO_USB, - AB9540_LDO_AUDIO, - AB9540_LDO_ANAMIC1, - AB9540_LDO_ANAMIC2, - AB9540_LDO_DMIC, - AB9540_LDO_ANA, - AB9540_SYSCLKREQ_2, - AB9540_SYSCLKREQ_4, - AB9540_NUM_REGULATORS, -}; - -/* AB8540 regulators */ -enum ab8540_regulator_id { - AB8540_LDO_AUX1, - AB8540_LDO_AUX2, - AB8540_LDO_AUX3, - AB8540_LDO_AUX4, - AB8540_LDO_AUX5, - AB8540_LDO_AUX6, - AB8540_LDO_INTCORE, - AB8540_LDO_TVOUT, - AB8540_LDO_AUDIO, - AB8540_LDO_ANAMIC1, - AB8540_LDO_ANAMIC2, - AB8540_LDO_DMIC, - AB8540_LDO_ANA, - AB8540_LDO_SDIO, - AB8540_SYSCLKREQ_2, - AB8540_SYSCLKREQ_4, - AB8540_NUM_REGULATORS, -}; - -/* AB8500, AB8505, and AB9540 register initialization */ +/* AB8500 and AB8505 register initialization */ struct ab8500_regulator_reg_init { int id; u8 mask; @@ -185,121 +145,6 @@ enum ab8505_regulator_reg { AB8505_NUM_REGULATOR_REGISTERS, }; -/* AB9540 registers */ -enum ab9540_regulator_reg { - AB9540_REGUREQUESTCTRL1, - AB9540_REGUREQUESTCTRL2, - AB9540_REGUREQUESTCTRL3, - AB9540_REGUREQUESTCTRL4, - AB9540_REGUSYSCLKREQ1HPVALID1, - AB9540_REGUSYSCLKREQ1HPVALID2, - AB9540_REGUHWHPREQ1VALID1, - AB9540_REGUHWHPREQ1VALID2, - AB9540_REGUHWHPREQ2VALID1, - AB9540_REGUHWHPREQ2VALID2, - AB9540_REGUSWHPREQVALID1, - AB9540_REGUSWHPREQVALID2, - AB9540_REGUSYSCLKREQVALID1, - AB9540_REGUSYSCLKREQVALID2, - AB9540_REGUVAUX4REQVALID, - AB9540_REGUMISC1, - AB9540_VAUDIOSUPPLY, - AB9540_REGUCTRL1VAMIC, - AB9540_VSMPS1REGU, - AB9540_VSMPS2REGU, - AB9540_VSMPS3REGU, /* NOTE! PRCMU register */ - AB9540_VPLLVANAREGU, - AB9540_EXTSUPPLYREGU, - AB9540_VAUX12REGU, - AB9540_VRF1VAUX3REGU, - AB9540_VSMPS1SEL1, - AB9540_VSMPS1SEL2, - AB9540_VSMPS1SEL3, - AB9540_VSMPS2SEL1, - AB9540_VSMPS2SEL2, - AB9540_VSMPS2SEL3, - AB9540_VSMPS3SEL1, /* NOTE! PRCMU register */ - AB9540_VSMPS3SEL2, /* NOTE! 
PRCMU register */ - AB9540_VAUX1SEL, - AB9540_VAUX2SEL, - AB9540_VRF1VAUX3SEL, - AB9540_REGUCTRL2SPARE, - AB9540_VAUX4REQCTRL, - AB9540_VAUX4REGU, - AB9540_VAUX4SEL, - AB9540_REGUCTRLDISCH, - AB9540_REGUCTRLDISCH2, - AB9540_REGUCTRLDISCH3, - AB9540_NUM_REGULATOR_REGISTERS, -}; - -/* AB8540 registers */ -enum ab8540_regulator_reg { - AB8540_REGUREQUESTCTRL1, - AB8540_REGUREQUESTCTRL2, - AB8540_REGUREQUESTCTRL3, - AB8540_REGUREQUESTCTRL4, - AB8540_REGUSYSCLKREQ1HPVALID1, - AB8540_REGUSYSCLKREQ1HPVALID2, - AB8540_REGUHWHPREQ1VALID1, - AB8540_REGUHWHPREQ1VALID2, - AB8540_REGUHWHPREQ2VALID1, - AB8540_REGUHWHPREQ2VALID2, - AB8540_REGUSWHPREQVALID1, - AB8540_REGUSWHPREQVALID2, - AB8540_REGUSYSCLKREQVALID1, - AB8540_REGUSYSCLKREQVALID2, - AB8540_REGUVAUX4REQVALID, - AB8540_REGUVAUX5REQVALID, - AB8540_REGUVAUX6REQVALID, - AB8540_REGUVCLKBREQVALID, - AB8540_REGUVRF1REQVALID, - AB8540_REGUMISC1, - AB8540_VAUDIOSUPPLY, - AB8540_REGUCTRL1VAMIC, - AB8540_VHSIC, - AB8540_VSDIO, - AB8540_VSMPS1REGU, - AB8540_VSMPS2REGU, - AB8540_VSMPS3REGU, - AB8540_VPLLVANAREGU, - AB8540_EXTSUPPLYREGU, - AB8540_VAUX12REGU, - AB8540_VRF1VAUX3REGU, - AB8540_VSMPS1SEL1, - AB8540_VSMPS1SEL2, - AB8540_VSMPS1SEL3, - AB8540_VSMPS2SEL1, - AB8540_VSMPS2SEL2, - AB8540_VSMPS2SEL3, - AB8540_VSMPS3SEL1, - AB8540_VSMPS3SEL2, - AB8540_VAUX1SEL, - AB8540_VAUX2SEL, - AB8540_VRF1VAUX3SEL, - AB8540_REGUCTRL2SPARE, - AB8540_VAUX4REQCTRL, - AB8540_VAUX4REGU, - AB8540_VAUX4SEL, - AB8540_VAUX5REQCTRL, - AB8540_VAUX5REGU, - AB8540_VAUX5SEL, - AB8540_VAUX6REQCTRL, - AB8540_VAUX6REGU, - AB8540_VAUX6SEL, - AB8540_VCLKBREQCTRL, - AB8540_VCLKBREGU, - AB8540_VCLKBSEL, - AB8540_VRF1REQCTRL, - AB8540_REGUCTRLDISCH, - AB8540_REGUCTRLDISCH2, - AB8540_REGUCTRLDISCH3, - AB8540_REGUCTRLDISCH4, - AB8540_VSIMSYSCLKCTRL, - AB8540_VANAVPLLSEL, - AB8540_NUM_REGULATOR_REGISTERS, -}; - /* AB8500 external regulators */ struct ab8500_ext_regulator_cfg { bool hwreq; /* requires hw mode or high power mode */ -- cgit v1.2.3 From 964f7c0dd23de68c0a3f33a91ca10775ef39ad71 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Wed, 28 Mar 2018 20:26:09 +0100 Subject: soc: renesas: rcar-sysc: Add r8a77470 support Add support for RZ/G1C (R8A77470) SoC power areas to the R-Car SYSC driver. 
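[For readers decoding the positional initializers in the r8a77470_areas[] table below, the "ca7-cpu0" entry can be sketched with designated initializers. This is an explanatory gloss, not part of the patch; the field names are assumed from struct rcar_sysc_area in drivers/soc/renesas/rcar-sysc.h.]

~~~~
/* Sketch only: the same data as the positional "ca7-cpu0" entry below. */
static const struct rcar_sysc_area ca7_cpu0_area __initconst = {
	.name      = "ca7-cpu0",
	.chan_offs = 0x1c0,			/* offset of this area's power control channel */
	.chan_bit  = 0,				/* bit for CPU0 within that channel */
	.isr_bit   = R8A77470_PD_CA7_CPU0,	/* SYSCISR bit == power domain index */
	.parent    = R8A77470_PD_CA7_SCU,	/* powered underneath the CA7 SCU area */
	.flags     = PD_CPU_NOCR,		/* CPU area without its own control register */
};
~~~~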
Signed-off-by: Biju Das Reviewed-by: Fabrizio Castro Reviewed-by: Geert Uytterhoeven Signed-off-by: Simon Horman --- .../bindings/power/renesas,rcar-sysc.txt | 1 + drivers/soc/renesas/Kconfig | 5 ++++ drivers/soc/renesas/Makefile | 1 + drivers/soc/renesas/r8a77470-sysc.c | 29 ++++++++++++++++++++++ drivers/soc/renesas/rcar-sysc.c | 3 +++ drivers/soc/renesas/rcar-sysc.h | 1 + include/dt-bindings/power/r8a77470-sysc.h | 22 ++++++++++++++++ 7 files changed, 62 insertions(+) create mode 100644 drivers/soc/renesas/r8a77470-sysc.c create mode 100644 include/dt-bindings/power/r8a77470-sysc.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/power/renesas,rcar-sysc.txt b/Documentation/devicetree/bindings/power/renesas,rcar-sysc.txt index ab399e559257..3e91d2032253 100644 --- a/Documentation/devicetree/bindings/power/renesas,rcar-sysc.txt +++ b/Documentation/devicetree/bindings/power/renesas,rcar-sysc.txt @@ -9,6 +9,7 @@ Required properties: - compatible: Must contain exactly one of the following: - "renesas,r8a7743-sysc" (RZ/G1M) - "renesas,r8a7745-sysc" (RZ/G1E) + - "renesas,r8a77470-sysc" (RZ/G1C) - "renesas,r8a7779-sysc" (R-Car H1) - "renesas,r8a7790-sysc" (R-Car H2) - "renesas,r8a7791-sysc" (R-Car M2-W) diff --git a/drivers/soc/renesas/Kconfig b/drivers/soc/renesas/Kconfig index 3bbe6114a420..96dd93660359 100644 --- a/drivers/soc/renesas/Kconfig +++ b/drivers/soc/renesas/Kconfig @@ -7,6 +7,7 @@ config SOC_RENESAS ARCH_R8A77970 || ARCH_R8A77980 || ARCH_R8A77995 select SYSC_R8A7743 if ARCH_R8A7743 select SYSC_R8A7745 if ARCH_R8A7745 + select SYSC_R8A77470 if ARCH_R8A77470 select SYSC_R8A7779 if ARCH_R8A7779 select SYSC_R8A7790 if ARCH_R8A7790 select SYSC_R8A7791 if ARCH_R8A7791 || ARCH_R8A7793 @@ -30,6 +31,10 @@ config SYSC_R8A7745 bool "RZ/G1E System Controller support" if COMPILE_TEST select SYSC_RCAR +config SYSC_R8A77470 + bool "RZ/G1C System Controller support" if COMPILE_TEST + select SYSC_RCAR + config SYSC_R8A7779 bool "R-Car H1 System Controller support" if COMPILE_TEST select SYSC_RCAR diff --git a/drivers/soc/renesas/Makefile b/drivers/soc/renesas/Makefile index ccb5ec57a262..a86ece7b84d1 100644 --- a/drivers/soc/renesas/Makefile +++ b/drivers/soc/renesas/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_SOC_RENESAS) += renesas-soc.o # SoC obj-$(CONFIG_SYSC_R8A7743) += r8a7743-sysc.o obj-$(CONFIG_SYSC_R8A7745) += r8a7745-sysc.o +obj-$(CONFIG_SYSC_R8A77470) += r8a77470-sysc.o obj-$(CONFIG_SYSC_R8A7779) += r8a7779-sysc.o obj-$(CONFIG_SYSC_R8A7790) += r8a7790-sysc.o obj-$(CONFIG_SYSC_R8A7791) += r8a7791-sysc.o diff --git a/drivers/soc/renesas/r8a77470-sysc.c b/drivers/soc/renesas/r8a77470-sysc.c new file mode 100644 index 000000000000..cfa015e208ef --- /dev/null +++ b/drivers/soc/renesas/r8a77470-sysc.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Renesas RZ/G1C System Controller + * + * Copyright (C) 2018 Renesas Electronics Corp. 
+ */ + +#include <linux/bug.h> +#include <linux/kernel.h> + +#include <dt-bindings/power/r8a77470-sysc.h> + +#include "rcar-sysc.h" + +static const struct rcar_sysc_area r8a77470_areas[] __initconst = { + { "always-on", 0, 0, R8A77470_PD_ALWAYS_ON, -1, PD_ALWAYS_ON }, + { "ca7-scu", 0x100, 0, R8A77470_PD_CA7_SCU, R8A77470_PD_ALWAYS_ON, + PD_SCU }, + { "ca7-cpu0", 0x1c0, 0, R8A77470_PD_CA7_CPU0, R8A77470_PD_CA7_SCU, + PD_CPU_NOCR }, + { "ca7-cpu1", 0x1c0, 1, R8A77470_PD_CA7_CPU1, R8A77470_PD_CA7_SCU, + PD_CPU_NOCR }, + { "sgx", 0xc0, 0, R8A77470_PD_SGX, R8A77470_PD_ALWAYS_ON }, +}; + +const struct rcar_sysc_info r8a77470_sysc_info __initconst = { + .areas = r8a77470_areas, + .num_areas = ARRAY_SIZE(r8a77470_areas), +}; diff --git a/drivers/soc/renesas/rcar-sysc.c b/drivers/soc/renesas/rcar-sysc.c index faf20e719361..99203bdc333a 100644 --- a/drivers/soc/renesas/rcar-sysc.c +++ b/drivers/soc/renesas/rcar-sysc.c @@ -261,6 +261,9 @@ static const struct of_device_id rcar_sysc_matches[] __initconst = { #ifdef CONFIG_SYSC_R8A7745 { .compatible = "renesas,r8a7745-sysc", .data = &r8a7745_sysc_info }, #endif +#ifdef CONFIG_SYSC_R8A77470 + { .compatible = "renesas,r8a77470-sysc", .data = &r8a77470_sysc_info }, +#endif #ifdef CONFIG_SYSC_R8A7779 { .compatible = "renesas,r8a7779-sysc", .data = &r8a7779_sysc_info }, #endif diff --git a/drivers/soc/renesas/rcar-sysc.h b/drivers/soc/renesas/rcar-sysc.h index dcdc9ec8eba7..9b24e3af288f 100644 --- a/drivers/soc/renesas/rcar-sysc.h +++ b/drivers/soc/renesas/rcar-sysc.h @@ -51,6 +51,7 @@ struct rcar_sysc_info { extern const struct rcar_sysc_info r8a7743_sysc_info; extern const struct rcar_sysc_info r8a7745_sysc_info; +extern const struct rcar_sysc_info r8a77470_sysc_info; extern const struct rcar_sysc_info r8a7779_sysc_info; extern const struct rcar_sysc_info r8a7790_sysc_info; extern const struct rcar_sysc_info r8a7791_sysc_info; diff --git a/include/dt-bindings/power/r8a77470-sysc.h b/include/dt-bindings/power/r8a77470-sysc.h new file mode 100644 index 000000000000..8bf4db187c31 --- /dev/null +++ b/include/dt-bindings/power/r8a77470-sysc.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (C) 2018 Renesas Electronics Corp. + */ +#ifndef __DT_BINDINGS_POWER_R8A77470_SYSC_H__ +#define __DT_BINDINGS_POWER_R8A77470_SYSC_H__ + +/* + * These power domain indices match the numbers of the interrupt bits + * representing the power areas in the various Interrupt Registers + * (e.g. SYSCISR, Interrupt Status Register) + */ + +#define R8A77470_PD_CA7_CPU0 5 +#define R8A77470_PD_CA7_CPU1 6 +#define R8A77470_PD_SGX 20 +#define R8A77470_PD_CA7_SCU 21 + +/* Always-on power area */ +#define R8A77470_PD_ALWAYS_ON 32 + +#endif /* __DT_BINDINGS_POWER_R8A77470_SYSC_H__ */ -- cgit v1.2.3 From 355f9e6482b2e5c34aa46cac199eff56de286514 Mon Sep 17 00:00:00 2001 From: Takeshi Kihara Date: Wed, 11 Apr 2018 18:36:24 +0900 Subject: soc: renesas: Add r8a77990 SYSC PM Domain Binding Definitions This patch adds power domain indices for R-Car E3.
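[As a hedged illustration of what these indices are for, not code from the patch: each index doubles as a bit number in the SYSC interrupt registers, so a status check against a raw SYSCISR value reduces to a BIT() test.]

~~~~
#include <linux/bitops.h>
#include <linux/types.h>
#include <dt-bindings/power/r8a77990-sysc.h>

/* Sketch: did the CA53 CPU0 power area flag an event in SYSCISR? */
static inline bool ca53_cpu0_event(u32 syscisr)
{
	return syscisr & BIT(R8A77990_PD_CA53_CPU0);
}
~~~~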
Signed-off-by: Takeshi Kihara [shimoda: add commit log and SPDX-License-Identifier] Signed-off-by: Yoshihiro Shimoda Reviewed-by: Geert Uytterhoeven Signed-off-by: Simon Horman --- include/dt-bindings/power/r8a77990-sysc.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 include/dt-bindings/power/r8a77990-sysc.h (limited to 'include') diff --git a/include/dt-bindings/power/r8a77990-sysc.h b/include/dt-bindings/power/r8a77990-sysc.h new file mode 100644 index 000000000000..944d85beec15 --- /dev/null +++ b/include/dt-bindings/power/r8a77990-sysc.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2018 Renesas Electronics Corp. + */ +#ifndef __DT_BINDINGS_POWER_R8A77990_SYSC_H__ +#define __DT_BINDINGS_POWER_R8A77990_SYSC_H__ + +/* + * These power domain indices match the numbers of the interrupt bits + * representing the power areas in the various Interrupt Registers + * (e.g. SYSCISR, Interrupt Status Register) + */ + +#define R8A77990_PD_CA53_CPU0 5 +#define R8A77990_PD_CA53_CPU1 6 +#define R8A77990_PD_CR7 13 +#define R8A77990_PD_A3VC 14 +#define R8A77990_PD_3DG_A 17 +#define R8A77990_PD_3DG_B 18 +#define R8A77990_PD_CA53_SCU 21 +#define R8A77990_PD_A2VC1 26 + +/* Always-on power area */ +#define R8A77990_PD_ALWAYS_ON 32 + +#endif /* __DT_BINDINGS_POWER_R8A77990_SYSC_H__ */ -- cgit v1.2.3 From a941e2fab3207cb0d57dc4ec47b1b12c8ea78b84 Mon Sep 17 00:00:00 2001 From: Kirill Marinushkin Date: Wed, 4 Apr 2018 06:19:37 +0200 Subject: ASoC: topology: Fix bclk and fsync inversion in set_link_hw_format() The values of bclk and fsync are inverted WRT the codec. But the existing solution already works for Broadwell, see the alsa-lib config: `alsa-lib/src/conf/topology/broadwell/broadwell.conf` This commit provides a backwards-compatible solution to fix this misuse. Signed-off-by: Kirill Marinushkin Reviewed-by: Pierre-Louis Bossart Tested-by: Pan Xiuli Tested-by: Pierre-Louis Bossart Cc: Jaroslav Kysela Cc: Takashi Iwai Cc: Mark Brown Cc: Liam Girdwood Cc: linux-kernel@vger.kernel.org Cc: alsa-devel@alsa-project.org Signed-off-by: Mark Brown --- include/uapi/sound/asoc.h | 16 ++++++++++++++-- sound/soc/soc-topology.c | 12 +++++++----- 2 files changed, 21 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/uapi/sound/asoc.h b/include/uapi/sound/asoc.h index 69c37ecbff7e..f0e5e21efa54 100644 --- a/include/uapi/sound/asoc.h +++ b/include/uapi/sound/asoc.h @@ -160,6 +160,18 @@ #define SND_SOC_TPLG_LNK_FLGBIT_SYMMETRIC_SAMPLEBITS (1 << 2) #define SND_SOC_TPLG_LNK_FLGBIT_VOICE_WAKEUP (1 << 3) +/* DAI topology BCLK parameter + * For backwards compatibility, by default the codec is bclk master + */ +#define SND_SOC_TPLG_BCLK_CM 0 /* codec is bclk master */ +#define SND_SOC_TPLG_BCLK_CS 1 /* codec is bclk slave */ + +/* DAI topology FSYNC parameter + * For backwards compatibility, by default the codec is fsync master + */ +#define SND_SOC_TPLG_FSYNC_CM 0 /* codec is fsync master */ +#define SND_SOC_TPLG_FSYNC_CS 1 /* codec is fsync slave */ + +/* * Block Header. * This header precedes all object and object arrays below.
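Read together with the set_link_hw_format() hunk further below, the new CM/CS constants collapse into the standard SND_SOC_DAIFMT_* clock-master flags; a condensed, illustrative helper (ours, not part of the patch) showing that mapping:

~~~~
/* Illustrative only: the mapping the soc-topology.c hunk below implements. */
static unsigned int tplg_clk_masters_to_dai_fmt(u8 bclk_master, u8 fsync_master)
{
	bool codec_bclk_m = (bclk_master == SND_SOC_TPLG_BCLK_CM);
	bool codec_fsync_m = (fsync_master == SND_SOC_TPLG_FSYNC_CM);

	if (codec_bclk_m && codec_fsync_m)
		return SND_SOC_DAIFMT_CBM_CFM;	/* codec masters both clocks */
	if (!codec_bclk_m && codec_fsync_m)
		return SND_SOC_DAIFMT_CBS_CFM;	/* codec masters fsync only */
	if (codec_bclk_m && !codec_fsync_m)
		return SND_SOC_DAIFMT_CBM_CFS;	/* codec masters bclk only */
	return SND_SOC_DAIFMT_CBS_CFS;		/* codec is slave on both */
}
~~~~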
@@ -315,8 +327,8 @@ struct snd_soc_tplg_hw_config { __u8 clock_gated; /* 1 if clock can be gated to save power */ __u8 invert_bclk; /* 1 for inverted BCLK, 0 for normal */ __u8 invert_fsync; /* 1 for inverted frame clock, 0 for normal */ - __u8 bclk_master; /* 1 for master of BCLK, 0 for slave */ - __u8 fsync_master; /* 1 for master of FSYNC, 0 for slave */ + __u8 bclk_master; /* SND_SOC_TPLG_BCLK_ value */ + __u8 fsync_master; /* SND_SOC_TPLG_FSYNC_ value */ __u8 mclk_direction; /* 0 for input, 1 for output */ __le16 reserved; /* for 32bit alignment */ __le32 mclk_rate; /* MCLK or SYSCLK frequency in Hz */ diff --git a/sound/soc/soc-topology.c b/sound/soc/soc-topology.c index 942c6e5eb4b7..1c55252641f3 100644 --- a/sound/soc/soc-topology.c +++ b/sound/soc/soc-topology.c @@ -2019,13 +2019,15 @@ static void set_link_hw_format(struct snd_soc_dai_link *link, link->dai_fmt |= SND_SOC_DAIFMT_IB_IF; /* clock masters */ - bclk_master = hw_config->bclk_master; - fsync_master = hw_config->fsync_master; - if (!bclk_master && !fsync_master) + bclk_master = (hw_config->bclk_master == + SND_SOC_TPLG_BCLK_CM); + fsync_master = (hw_config->fsync_master == + SND_SOC_TPLG_FSYNC_CM); + if (bclk_master && fsync_master) link->dai_fmt |= SND_SOC_DAIFMT_CBM_CFM; - else if (bclk_master && !fsync_master) - link->dai_fmt |= SND_SOC_DAIFMT_CBS_CFM; else if (!bclk_master && fsync_master) + link->dai_fmt |= SND_SOC_DAIFMT_CBS_CFM; + else if (bclk_master && !fsync_master) link->dai_fmt |= SND_SOC_DAIFMT_CBM_CFS; else link->dai_fmt |= SND_SOC_DAIFMT_CBS_CFS; -- cgit v1.2.3 From 933e1c4a667103c4d10ebdc9505a0a6abd8c3fbd Mon Sep 17 00:00:00 2001 From: Kirill Marinushkin Date: Wed, 4 Apr 2018 06:19:38 +0200 Subject: ASoC: topology: Add missing clock gating parameter when parsing hw_configs The clock gating parameter is part of `dai_fmt`. It is supported by `alsa-lib` when creating a topology binary file, but ignored by the kernel when loading this topology file. After applying this commit, the clock gating parameter is not ignored any more. This solution is backwards compatible. The existing behaviour is not broken, because by default the parameter value is 0 and is ignored. snd_soc_tplg_hw_config.clock_gated = 0 => no effect snd_soc_tplg_hw_config.clock_gated = 1 => SND_SOC_DAIFMT_GATED snd_soc_tplg_hw_config.clock_gated = 2 => SND_SOC_DAIFMT_CONT For example, the following config, based on alsa-lib/src/conf/topology/broadwell/broadwell.conf, is now supported: ~~~~ SectionHWConfig."CodecHWConfig" { id "1" format "I2S" # physical audio format.
pm_gate_clocks "true" # clock can be gated } SectionLink."Codec" { # used for binding to the physical link id "0" hw_configs [ "CodecHWConfig" ] default_hw_conf_id "1" } ~~~~ Signed-off-by: Kirill Marinushkin Reviewed-by: Pierre-Louis Bossart Cc: Jaroslav Kysela Cc: Takashi Iwai Cc: Mark Brown Cc: Pan Xiuli Cc: Liam Girdwood Cc: linux-kernel@vger.kernel.org Cc: alsa-devel@alsa-project.org Signed-off-by: Mark Brown --- include/uapi/sound/asoc.h | 7 ++++++- sound/soc/soc-topology.c | 7 +++++++ 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/sound/asoc.h b/include/uapi/sound/asoc.h index f0e5e21efa54..f3c4b46e39d8 100644 --- a/include/uapi/sound/asoc.h +++ b/include/uapi/sound/asoc.h @@ -139,6 +139,11 @@ #define SND_SOC_TPLG_DAI_FLGBIT_SYMMETRIC_CHANNELS (1 << 1) #define SND_SOC_TPLG_DAI_FLGBIT_SYMMETRIC_SAMPLEBITS (1 << 2) +/* DAI clock gating */ +#define SND_SOC_TPLG_DAI_CLK_GATE_UNDEFINED 0 +#define SND_SOC_TPLG_DAI_CLK_GATE_GATED 1 +#define SND_SOC_TPLG_DAI_CLK_GATE_CONT 2 + /* DAI physical PCM data formats. * Add new formats to the end of the list. */ @@ -324,7 +329,7 @@ struct snd_soc_tplg_hw_config { __le32 size; /* in bytes of this structure */ __le32 id; /* unique ID - - used to match */ __le32 fmt; /* SND_SOC_DAI_FORMAT_ format value */ - __u8 clock_gated; /* 1 if clock can be gated to save power */ + __u8 clock_gated; /* SND_SOC_TPLG_DAI_CLK_GATE_ value */ __u8 invert_bclk; /* 1 for inverted BCLK, 0 for normal */ __u8 invert_fsync; /* 1 for inverted frame clock, 0 for normal */ __u8 bclk_master; /* SND_SOC_TPLG_BCLK_ value */ diff --git a/sound/soc/soc-topology.c b/sound/soc/soc-topology.c index 1c55252641f3..aab31144f683 100644 --- a/sound/soc/soc-topology.c +++ b/sound/soc/soc-topology.c @@ -2006,6 +2006,13 @@ static void set_link_hw_format(struct snd_soc_dai_link *link, link->dai_fmt = hw_config->fmt & SND_SOC_DAIFMT_FORMAT_MASK; + /* clock gating */ + if (hw_config->clock_gated == SND_SOC_TPLG_DAI_CLK_GATE_GATED) + link->dai_fmt |= SND_SOC_DAIFMT_GATED; + else if (hw_config->clock_gated == + SND_SOC_TPLG_DAI_CLK_GATE_CONT) + link->dai_fmt |= SND_SOC_DAIFMT_CONT; + /* clock signal polarity */ invert_bclk = hw_config->invert_bclk; invert_fsync = hw_config->invert_fsync; -- cgit v1.2.3 From e590522a06adce8ca2eb47e77d80616cd1542d91 Mon Sep 17 00:00:00 2001 From: Kirill Marinushkin Date: Wed, 4 Apr 2018 06:19:39 +0200 Subject: ASoC: topology: Add definitions for mclk_direction values The current comment does not make the direction of mclk clear. Previously, a similar description caused a misunderstanding for bclk_master and fsync_master. This commit resolves the potential confusion the same way it was resolved for bclk_master and fsync_master. Signed-off-by: Kirill Marinushkin Acked-by: Pierre-Louis Bossart Cc: Jaroslav Kysela Cc: Takashi Iwai Cc: Mark Brown Cc: Pan Xiuli Cc: Liam Girdwood Cc: linux-kernel@vger.kernel.org Cc: alsa-devel@alsa-project.org Signed-off-by: Mark Brown --- include/uapi/sound/asoc.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/sound/asoc.h b/include/uapi/sound/asoc.h index f3c4b46e39d8..b901cdbe532a 100644 --- a/include/uapi/sound/asoc.h +++ b/include/uapi/sound/asoc.h @@ -144,6 +144,10 @@ #define SND_SOC_TPLG_DAI_CLK_GATE_GATED 1 #define SND_SOC_TPLG_DAI_CLK_GATE_CONT 2 +/* DAI mclk_direction */ +#define SND_SOC_TPLG_MCLK_CO 0 /* for codec, mclk is output */ +#define SND_SOC_TPLG_MCLK_CI 1 /* for codec, mclk is input */ + /* DAI physical PCM data formats.
* Add new formats to the end of the list. */ @@ -334,7 +338,7 @@ struct snd_soc_tplg_hw_config { __u8 invert_fsync; /* 1 for inverted frame clock, 0 for normal */ __u8 bclk_master; /* SND_SOC_TPLG_BCLK_ value */ __u8 fsync_master; /* SND_SOC_TPLG_FSYNC_ value */ - __u8 mclk_direction; /* 0 for input, 1 for output */ + __u8 mclk_direction; /* SND_SOC_TPLG_MCLK_ value */ __le16 reserved; /* for 32bit alignment */ __le32 mclk_rate; /* MCLK or SYSCLK frequency in Hz */ __le32 bclk_rate; /* BCLK frequency in Hz */ -- cgit v1.2.3 From 44773ba170a6f969620221a6d87d03feae5e464f Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 16 Apr 2018 10:22:01 -0700 Subject: ARM: OMAP2+: Drop unused pm-noop Looks like these functions don't do anything in the mainline kernel so we can just drop them. Note that we must now also remove ir-rx51 pdata as it relies on the dummy platform data that does not do anything. And ir-rx51 calls a pdata callback that does nothing, without first checking that it exists. For configuring device-specific minimal latencies, the interface to use is pm_qos_add_request(). For an example, see what was done in commit 9834ffd1ecc3 ("ASoC: omap-mcbsp: Add PM QoS support for McBSP to prevent glitches"). I've added some comments to ir-rx51 so people using it can add pm_qos support and test it. Cc: Ivaylo Dimitrov Cc: Kevin Hilman Cc: Laurent Pinchart Cc: Tomi Valkeinen Acked-by: Mauro Carvalho Chehab Signed-off-by: Tony Lindgren --- arch/arm/mach-omap2/Makefile | 1 - arch/arm/mach-omap2/display.c | 7 -- arch/arm/mach-omap2/hsmmc.c | 1 - arch/arm/mach-omap2/i2c.c | 1 - arch/arm/mach-omap2/io.c | 3 - arch/arm/mach-omap2/omap-pm-noop.c | 176 ---------------------------- arch/arm/mach-omap2/omap-pm.h | 161 ------------------------- arch/arm/mach-omap2/pdata-quirks.c | 15 --- arch/arm/mach-omap2/pm-debug.c | 5 - arch/arm/mach-omap2/pm.c | 10 +- arch/arm/mach-omap2/timer.c | 1 - arch/arm/plat-omap/Kconfig | 10 -- drivers/media/rc/ir-rx51.c | 17 +-- include/linux/platform_data/media/ir-rx51.h | 9 -- 14 files changed, 4 insertions(+), 413 deletions(-) delete mode 100644 arch/arm/mach-omap2/omap-pm-noop.c delete mode 100644 arch/arm/mach-omap2/omap-pm.h delete mode 100644 include/linux/platform_data/media/ir-rx51.h (limited to 'include') diff --git a/arch/arm/mach-omap2/Makefile b/arch/arm/mach-omap2/Makefile index 4603c30fef73..95b64b2940e9 100644 --- a/arch/arm/mach-omap2/Makefile +++ b/arch/arm/mach-omap2/Makefile @@ -78,7 +78,6 @@ endif omap-4-5-pm-common = omap-mpuss-lowpower.o obj-$(CONFIG_ARCH_OMAP4) += $(omap-4-5-pm-common) obj-$(CONFIG_SOC_OMAP5) += $(omap-4-5-pm-common) -obj-$(CONFIG_OMAP_PM_NOOP) += omap-pm-noop.o ifeq ($(CONFIG_PM),y) obj-$(CONFIG_ARCH_OMAP2) += pm24xx.o diff --git a/arch/arm/mach-omap2/display.c b/arch/arm/mach-omap2/display.c index b3f6eb5d04a2..480a2b989908 100644 --- a/arch/arm/mach-omap2/display.c +++ b/arch/arm/mach-omap2/display.c @@ -32,7 +32,6 @@ #include #include "omap_hwmod.h" #include "omap_device.h" -#include "omap-pm.h" #include "common.h" #include "soc.h" @@ -126,11 +125,6 @@ static void omap_dsi_disable_pads(int dsi_id, unsigned lane_mask) omap4_dsi_mux_pads(dsi_id, 0); } -static int omap_dss_set_min_bus_tput(struct device *dev, unsigned long tput) -{ - return omap_pm_set_min_bus_tput(dev, OCP_INITIATOR_AGENT, tput); -} - static enum omapdss_version __init omap_display_get_version(void) { if (cpu_is_omap24xx()) @@ -169,7 +163,6 @@ static int __init omapdss_init_fbdev(void) static struct omap_dss_board_info board_data = { .dsi_enable_pads =
omap_dsi_enable_pads, .dsi_disable_pads = omap_dsi_disable_pads, - .set_min_bus_tput = omap_dss_set_min_bus_tput, }; struct device_node *node; int r; diff --git a/arch/arm/mach-omap2/hsmmc.c b/arch/arm/mach-omap2/hsmmc.c index b064066d431c..0103548b0b15 100644 --- a/arch/arm/mach-omap2/hsmmc.c +++ b/arch/arm/mach-omap2/hsmmc.c @@ -18,7 +18,6 @@ #include "soc.h" #include "omap_device.h" -#include "omap-pm.h" #include "hsmmc.h" #include "control.h" diff --git a/arch/arm/mach-omap2/i2c.c b/arch/arm/mach-omap2/i2c.c index 91a21c3923b2..37ff25ee3d89 100644 --- a/arch/arm/mach-omap2/i2c.c +++ b/arch/arm/mach-omap2/i2c.c @@ -22,7 +22,6 @@ #include "soc.h" #include "omap_hwmod.h" #include "omap_device.h" -#include "omap-pm.h" #include "prm.h" #include "common.h" diff --git a/arch/arm/mach-omap2/io.c b/arch/arm/mach-omap2/io.c index cf546dfe3b32..6ce60a478409 100644 --- a/arch/arm/mach-omap2/io.c +++ b/arch/arm/mach-omap2/io.c @@ -37,7 +37,6 @@ #include "clock.h" #include "clock2xxx.h" #include "clock3xxx.h" -#include "omap-pm.h" #include "sdrc.h" #include "control.h" #include "serial.h" @@ -421,8 +420,6 @@ static void __init __maybe_unused omap_hwmod_init_postsetup(void) postsetup_state = _HWMOD_STATE_ENABLED; #endif omap_hwmod_for_each(_set_hwmod_postsetup_state, &postsetup_state); - - omap_pm_if_early_init(); } static void __init __maybe_unused omap_common_late_init(void) diff --git a/arch/arm/mach-omap2/omap-pm-noop.c b/arch/arm/mach-omap2/omap-pm-noop.c deleted file mode 100644 index 4ead077ea4e7..000000000000 --- a/arch/arm/mach-omap2/omap-pm-noop.c +++ /dev/null @@ -1,176 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * omap-pm-noop.c - OMAP power management interface - dummy version - * - * This code implements the OMAP power management interface to - * drivers, CPUIdle, CPUFreq, and DSP Bridge. It is strictly for - * debug/demonstration use, as it does nothing but printk() whenever a - * function is called (when DEBUG is defined, below) - * - * Copyright (C) 2008-2009 Texas Instruments, Inc. - * Copyright (C) 2008-2009 Nokia Corporation - * Paul Walmsley - * - * Interface developed by (in alphabetical order): - * Karthik Dasu, Tony Lindgren, Rajendra Nayak, Sakari Poussa, Veeramanikandan - * Raju, Anand Sawant, Igor Stoppa, Paul Walmsley, Richard Woodruff - */ - -#undef DEBUG - -#include -#include -#include -#include - -#include "omap_device.h" -#include "omap-pm.h" - -static bool off_mode_enabled; -static int dummy_context_loss_counter; - -/* - * Device-driver-originated constraints (via board-*.c files) - */ - -int omap_pm_set_max_mpu_wakeup_lat(struct device *dev, long t) -{ - if (!dev || t < -1) { - WARN(1, "OMAP PM: %s: invalid parameter(s)", __func__); - return -EINVAL; - } - - if (t == -1) - pr_debug("OMAP PM: remove max MPU wakeup latency constraint: dev %s\n", - dev_name(dev)); - else - pr_debug("OMAP PM: add max MPU wakeup latency constraint: dev %s, t = %ld usec\n", - dev_name(dev), t); - - /* - * For current Linux, this needs to map the MPU to a - * powerdomain, then go through the list of current max lat - * constraints on the MPU and find the smallest. If - * the latency constraint has changed, the code should - * recompute the state to enter for the next powerdomain - * state. - * - * TI CDP code can call constraint_set here. 
- */ - - return 0; -} - -int omap_pm_set_min_bus_tput(struct device *dev, u8 agent_id, unsigned long r) -{ - if (!dev || (agent_id != OCP_INITIATOR_AGENT && - agent_id != OCP_TARGET_AGENT)) { - WARN(1, "OMAP PM: %s: invalid parameter(s)", __func__); - return -EINVAL; - } - - if (r == 0) - pr_debug("OMAP PM: remove min bus tput constraint: dev %s for agent_id %d\n", - dev_name(dev), agent_id); - else - pr_debug("OMAP PM: add min bus tput constraint: dev %s for agent_id %d: rate %ld KiB\n", - dev_name(dev), agent_id, r); - - /* - * This code should model the interconnect and compute the - * required clock frequency, convert that to a VDD2 OPP ID, then - * set the VDD2 OPP appropriately. - * - * TI CDP code can call constraint_set here on the VDD2 OPP. - */ - - return 0; -} - -/* - * DSP Bridge-specific constraints - */ - - -/** - * omap_pm_enable_off_mode - notify OMAP PM that off-mode is enabled - * - * Intended for use only by OMAP PM core code to notify this layer - * that off mode has been enabled. - */ -void omap_pm_enable_off_mode(void) -{ - off_mode_enabled = true; -} - -/** - * omap_pm_disable_off_mode - notify OMAP PM that off-mode is disabled - * - * Intended for use only by OMAP PM core code to notify this layer - * that off mode has been disabled. - */ -void omap_pm_disable_off_mode(void) -{ - off_mode_enabled = false; -} - -/* - * Device context loss tracking - */ - -#ifdef CONFIG_ARCH_OMAP2PLUS - -int omap_pm_get_dev_context_loss_count(struct device *dev) -{ - struct platform_device *pdev = to_platform_device(dev); - int count; - - if (WARN_ON(!dev)) - return -ENODEV; - - if (dev->pm_domain == &omap_device_pm_domain) { - count = omap_device_get_context_loss_count(pdev); - } else { - WARN_ONCE(off_mode_enabled, "omap_pm: using dummy context loss counter; device %s should be converted to omap_device", - dev_name(dev)); - - count = dummy_context_loss_counter; - - if (off_mode_enabled) { - count++; - /* - * Context loss count has to be a non-negative value. - * Clear the sign bit to get a value range from 0 to - * INT_MAX. - */ - count &= INT_MAX; - dummy_context_loss_counter = count; - } - } - - pr_debug("OMAP PM: context loss count for dev %s = %d\n", - dev_name(dev), count); - - return count; -} - -#else - -int omap_pm_get_dev_context_loss_count(struct device *dev) -{ - return dummy_context_loss_counter; -} - -#endif - -/* Should be called before clk framework init */ -int __init omap_pm_if_early_init(void) -{ - return 0; -} - -/* Must be called after clock framework is initialized */ -int __init omap_pm_if_init(void) -{ - return 0; -} diff --git a/arch/arm/mach-omap2/omap-pm.h b/arch/arm/mach-omap2/omap-pm.h deleted file mode 100644 index 5ba5df47f91b..000000000000 --- a/arch/arm/mach-omap2/omap-pm.h +++ /dev/null @@ -1,161 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * omap-pm.h - OMAP power management interface - * - * Copyright (C) 2008-2010 Texas Instruments, Inc. 
- * Copyright (C) 2008-2010 Nokia Corporation - * Paul Walmsley - * - * Interface developed by (in alphabetical order): Karthik Dasu, Jouni - * Högander, Tony Lindgren, Rajendra Nayak, Sakari Poussa, - * Veeramanikandan Raju, Anand Sawant, Igor Stoppa, Paul Walmsley, - * Richard Woodruff - */ - -#ifndef ASM_ARM_ARCH_OMAP_OMAP_PM_H -#define ASM_ARM_ARCH_OMAP_OMAP_PM_H - -#include -#include -#include -#include - -/* - * agent_id values for use with omap_pm_set_min_bus_tput(): - * - * OCP_INITIATOR_AGENT is only valid for devices that can act as - * initiators -- it represents the device's L3 interconnect - * connection. OCP_TARGET_AGENT represents the device's L4 - * interconnect connection. - */ -#define OCP_TARGET_AGENT 1 -#define OCP_INITIATOR_AGENT 2 - -/** - * omap_pm_if_early_init - OMAP PM init code called before clock fw init - * @mpu_opp_table: array ptr to struct omap_opp for MPU - * @dsp_opp_table: array ptr to struct omap_opp for DSP - * @l3_opp_table : array ptr to struct omap_opp for CORE - * - * Initialize anything that must be configured before the clock - * framework starts. The "_if_" is to avoid name collisions with the - * PM idle-loop code. - */ -int __init omap_pm_if_early_init(void); - -/** - * omap_pm_if_init - OMAP PM init code called after clock fw init - * - * The main initialization code. OPP tables are passed in here. The - * "_if_" is to avoid name collisions with the PM idle-loop code. - */ -int __init omap_pm_if_init(void); - -/* - * Device-driver-originated constraints (via board-*.c files, platform_data) - */ - - -/** - * omap_pm_set_max_mpu_wakeup_lat - set the maximum MPU wakeup latency - * @dev: struct device * requesting the constraint - * @t: maximum MPU wakeup latency in microseconds - * - * Request that the maximum interrupt latency for the MPU to be no - * greater than @t microseconds. "Interrupt latency" in this case is - * defined as the elapsed time from the occurrence of a hardware or - * timer interrupt to the time when the device driver's interrupt - * service routine has been entered by the MPU. - * - * It is intended that underlying PM code will use this information to - * determine what power state to put the MPU powerdomain into, and - * possibly the CORE powerdomain as well, since interrupt handling - * code currently runs from SDRAM. Advanced PM or board*.c code may - * also configure interrupt controller priorities, OCP bus priorities, - * CPU speed(s), etc. - * - * This function will not affect device wakeup latency, e.g., time - * elapsed from when a device driver enables a hardware device with - * clk_enable(), to when the device is ready for register access or - * other use. To control this device wakeup latency, use - * omap_pm_set_max_dev_wakeup_lat() - * - * Multiple calls to omap_pm_set_max_mpu_wakeup_lat() will replace the - * previous t value. To remove the latency target for the MPU, call - * with t = -1. - * - * XXX This constraint will be deprecated soon in favor of the more - * general omap_pm_set_max_dev_wakeup_lat() - * - * Returns -EINVAL for an invalid argument, -ERANGE if the constraint - * is not satisfiable, or 0 upon success. 
- */ -int omap_pm_set_max_mpu_wakeup_lat(struct device *dev, long t); - - -/** - * omap_pm_set_min_bus_tput - set minimum bus throughput needed by device - * @dev: struct device * requesting the constraint - * @tbus_id: interconnect to operate on (OCP_{INITIATOR,TARGET}_AGENT) - * @r: minimum throughput (in KiB/s) - * - * Request that the minimum data throughput on the OCP interconnect - * attached to device @dev interconnect agent @tbus_id be no less - * than @r KiB/s. - * - * It is expected that the OMAP PM or bus code will use this - * information to set the interconnect clock to run at the lowest - * possible speed that satisfies all current system users. The PM or - * bus code will adjust the estimate based on its model of the bus, so - * device driver authors should attempt to specify an accurate - * quantity for their device use case, and let the PM or bus code - * overestimate the numbers as necessary to handle request/response - * latency, other competing users on the system, etc. On OMAP2/3, if - * a driver requests a minimum L4 interconnect speed constraint, the - * code will also need to add an minimum L3 interconnect speed - * constraint, - * - * Multiple calls to omap_pm_set_min_bus_tput() will replace the - * previous rate value for this device. To remove the interconnect - * throughput restriction for this device, call with r = 0. - * - * Returns -EINVAL for an invalid argument, -ERANGE if the constraint - * is not satisfiable, or 0 upon success. - */ -int omap_pm_set_min_bus_tput(struct device *dev, u8 agent_id, unsigned long r); - - -/* - * CPUFreq-originated constraint - * - * In the future, this should be handled by custom OPP clocktype - * functions. - */ - - -/* - * Device context loss tracking - */ - -/** - * omap_pm_get_dev_context_loss_count - return count of times dev has lost ctx - * @dev: struct device * - * - * This function returns the number of times that the device @dev has - * lost its internal context. This generally occurs on a powerdomain - * transition to OFF. Drivers use this as an optimization to avoid restoring - * context if the device hasn't lost it. To use, drivers should initially - * call this in their context save functions and store the result. Early in - * the driver's context restore function, the driver should call this function - * again, and compare the result to the stored counter. If they differ, the - * driver must restore device context. If the number of context losses - * exceeds the maximum positive integer, the function will wrap to 0 and - * continue counting. Returns the number of context losses for this device, - * or negative value upon error. 
- */ -int omap_pm_get_dev_context_loss_count(struct device *dev); - -void omap_pm_enable_off_mode(void); -void omap_pm_disable_off_mode(void); - -#endif diff --git a/arch/arm/mach-omap2/pdata-quirks.c b/arch/arm/mach-omap2/pdata-quirks.c index 6459816c2879..7f02743edbe4 100644 --- a/arch/arm/mach-omap2/pdata-quirks.c +++ b/arch/arm/mach-omap2/pdata-quirks.c @@ -26,14 +26,12 @@ #include #include #include -#include #include #include "common.h" #include "common-board-devices.h" #include "control.h" #include "omap_device.h" -#include "omap-pm.h" #include "omap-secure.h" #include "soc.h" #include "hsmmc.h" @@ -514,18 +512,6 @@ void omap_auxdata_legacy_init(struct device *dev) dev->platform_data = &twl_gpio_auxdata; } -static struct ir_rx51_platform_data __maybe_unused rx51_ir_data = { - .set_max_mpu_wakeup_lat = omap_pm_set_max_mpu_wakeup_lat, -}; - -static struct platform_device __maybe_unused rx51_ir_device = { - .name = "ir_rx51", - .id = -1, - .dev = { - .platform_data = &rx51_ir_data, - }, -}; - #if IS_ENABLED(CONFIG_SND_OMAP_SOC_MCBSP) static struct omap_mcbsp_platform_data mcbsp_pdata; static void __init omap3_mcbsp_init(void) @@ -569,7 +555,6 @@ static struct of_dev_auxdata omap_auxdata_lookup[] = { "480c9000.smartreflex", &omap_sr_pdata[OMAP_SR_MPU]), OF_DEV_AUXDATA("ti,omap3-hsmmc", 0x4809c000, "4809c000.mmc", &mmc_pdata[0]), OF_DEV_AUXDATA("ti,omap3-hsmmc", 0x480b4000, "480b4000.mmc", &mmc_pdata[1]), - OF_DEV_AUXDATA("nokia,n900-ir", 0, "n900-ir", &rx51_ir_data), /* Only on am3517 */ OF_DEV_AUXDATA("ti,davinci_mdio", 0x5c030000, "davinci_mdio.0", NULL), OF_DEV_AUXDATA("ti,am3517-emac", 0x5c000000, "davinci_emac.0", diff --git a/arch/arm/mach-omap2/pm-debug.c b/arch/arm/mach-omap2/pm-debug.c index 5c46ea6756d7..acb698d5780f 100644 --- a/arch/arm/mach-omap2/pm-debug.c +++ b/arch/arm/mach-omap2/pm-debug.c @@ -31,7 +31,6 @@ #include "clock.h" #include "powerdomain.h" #include "clockdomain.h" -#include "omap-pm.h" #include "soc.h" #include "cm2xxx_3xxx.h" @@ -240,10 +239,6 @@ static int option_set(void *data, u64 val) *option = val; if (option == &enable_off_mode) { - if (val) - omap_pm_enable_off_mode(); - else - omap_pm_disable_off_mode(); if (cpu_is_omap34xx()) omap3_pm_off_mode_enable(val); } diff --git a/arch/arm/mach-omap2/pm.c b/arch/arm/mach-omap2/pm.c index 6f68576e5695..b98c46d7f112 100644 --- a/arch/arm/mach-omap2/pm.c +++ b/arch/arm/mach-omap2/pm.c @@ -16,11 +16,11 @@ #include #include #include +#include #include #include -#include "omap-pm.h" #include "omap_device.h" #include "common.h" @@ -230,14 +230,6 @@ static void __init omap4_init_voltages(void) omap2_set_init_voltage("iva", "dpll_iva_m5x2_ck", "iva"); } -static int __init omap2_common_pm_init(void) -{ - omap_pm_if_init(); - - return 0; -} -omap_postcore_initcall(omap2_common_pm_init); - int __init omap2_common_pm_late_init(void) { /* Init the voltage layer */ diff --git a/arch/arm/mach-omap2/timer.c b/arch/arm/mach-omap2/timer.c index 4fb4dc24e5e9..61dd72df119c 100644 --- a/arch/arm/mach-omap2/timer.c +++ b/arch/arm/mach-omap2/timer.c @@ -50,7 +50,6 @@ #include "omap_device.h" #include #include -#include "omap-pm.h" #include "soc.h" #include "common.h" diff --git a/arch/arm/plat-omap/Kconfig b/arch/arm/plat-omap/Kconfig index afc1a1d4f7a5..c0a242cae79a 100644 --- a/arch/arm/plat-omap/Kconfig +++ b/arch/arm/plat-omap/Kconfig @@ -115,16 +115,6 @@ config OMAP_SERIAL_WAKE to data on the serial RX line. This allows you to wake the system from serial console. 
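The REVISIT comments added to ir-rx51 below deliberately leave the pm_qos conversion to whoever can test it; a minimal sketch of that conversion, assuming the pm_qos_add_request() API of this era and reusing the 50 us bound from the deleted pdata callback:

~~~~
#include <linux/pm_qos.h>

static struct pm_qos_request ir_rx51_qos;	/* name is illustrative */

/* Before TX: bound CPU wakeup latency so deep idle states cannot
 * distort the generated IR pulse train (50 us mirrors the removed
 * set_max_mpu_wakeup_lat() call). */
pm_qos_add_request(&ir_rx51_qos, PM_QOS_CPU_DMA_LATENCY, 50);

/* ... transmit ... */

/* After TX: drop the constraint so the SoC may enter deep sleep again. */
pm_qos_remove_request(&ir_rx51_qos);
~~~~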
-choice - prompt "OMAP PM layer selection" - depends on ARCH_OMAP - default OMAP_PM_NOOP - -config OMAP_PM_NOOP - bool "No-op/debug PM layer" - -endchoice - endmenu endif diff --git a/drivers/media/rc/ir-rx51.c b/drivers/media/rc/ir-rx51.c index 49265f02e772..8a93f7468622 100644 --- a/drivers/media/rc/ir-rx51.c +++ b/drivers/media/rc/ir-rx51.c @@ -22,7 +22,6 @@ #include #include -#include #define WBUF_LEN 256 @@ -31,7 +30,6 @@ struct ir_rx51 { struct pwm_device *pwm; struct hrtimer timer; struct device *dev; - struct ir_rx51_platform_data *pdata; wait_queue_head_t wqueue; unsigned int freq; /* carrier frequency */ @@ -130,10 +128,9 @@ static int ir_rx51_tx(struct rc_dev *dev, unsigned int *buffer, ir_rx51->wbuf[count] = -1; /* Insert termination mark */ /* - * Adjust latency requirements so the device doesn't go in too - * deep sleep states + * REVISIT: Adjust latency requirements so the device doesn't go in too + * deep sleep states with pm_qos_add_request(). */ - ir_rx51->pdata->set_max_mpu_wakeup_lat(ir_rx51->dev, 50); ir_rx51_on(ir_rx51); ir_rx51->wbuf_index = 1; @@ -146,8 +143,7 @@ static int ir_rx51_tx(struct rc_dev *dev, unsigned int *buffer, */ wait_event_interruptible(ir_rx51->wqueue, ir_rx51->wbuf_index < 0); - /* We can sleep again */ - ir_rx51->pdata->set_max_mpu_wakeup_lat(ir_rx51->dev, -1); + /* REVISIT: Remove pm_qos constraint, we can sleep again */ return count; } @@ -244,13 +240,6 @@ static int ir_rx51_probe(struct platform_device *dev) struct pwm_device *pwm; struct rc_dev *rcdev; - ir_rx51.pdata = dev->dev.platform_data; - - if (!ir_rx51.pdata) { - dev_err(&dev->dev, "Platform Data is missing\n"); - return -ENXIO; - } - pwm = pwm_get(&dev->dev, NULL); if (IS_ERR(pwm)) { int err = PTR_ERR(pwm); diff --git a/include/linux/platform_data/media/ir-rx51.h b/include/linux/platform_data/media/ir-rx51.h deleted file mode 100644 index 9d127aa648e7..000000000000 --- a/include/linux/platform_data/media/ir-rx51.h +++ /dev/null @@ -1,9 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _IR_RX51_H -#define _IR_RX51_H - -struct ir_rx51_platform_data { - int(*set_max_mpu_wakeup_lat)(struct device *dev, long t); -}; - -#endif -- cgit v1.2.3 From 69c45d57bab00d5777e7686b3965b68b8ab043c7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 16 Apr 2018 14:20:26 -0400 Subject: remove rpc_rmdir() no users since 2014... 
Signed-off-by: Al Viro --- include/linux/sunrpc/rpc_pipe_fs.h | 2 -- net/sunrpc/rpc_pipe.c | 16 ---------------- 2 files changed, 18 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/rpc_pipe_fs.h b/include/linux/sunrpc/rpc_pipe_fs.h index a5704daf5df9..e90b9bd99ded 100644 --- a/include/linux/sunrpc/rpc_pipe_fs.h +++ b/include/linux/sunrpc/rpc_pipe_fs.h @@ -122,8 +122,6 @@ extern struct dentry *rpc_create_cache_dir(struct dentry *, struct cache_detail *); extern void rpc_remove_cache_dir(struct dentry *); -extern int rpc_rmdir(struct dentry *dentry); - struct rpc_pipe *rpc_mkpipe_data(const struct rpc_pipe_ops *ops, int flags); void rpc_destroy_pipe_data(struct rpc_pipe *pipe); extern struct dentry *rpc_mkpipe_dentry(struct dentry *, const char *, void *, diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 0f08934b2cea..9ec3bd7369dc 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -609,22 +609,6 @@ static int __rpc_rmdir(struct inode *dir, struct dentry *dentry) return ret; } -int rpc_rmdir(struct dentry *dentry) -{ - struct dentry *parent; - struct inode *dir; - int error; - - parent = dget_parent(dentry); - dir = d_inode(parent); - inode_lock_nested(dir, I_MUTEX_PARENT); - error = __rpc_rmdir(dir, dentry); - inode_unlock(dir); - dput(parent); - return error; -} -EXPORT_SYMBOL_GPL(rpc_rmdir); - static int __rpc_unlink(struct inode *dir, struct dentry *dentry) { int ret; -- cgit v1.2.3 From d59fb2856223219ccaa73bd2e96021f02ea5c266 Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Thu, 22 Mar 2018 14:12:33 +0800 Subject: ASoC: rt5668: add rt5668B codec driver This is the initial codec driver for rt5668b. Signed-off-by: Bard Liao Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/sound/rt5668.txt | 50 + include/sound/rt5668.h | 40 + sound/soc/codecs/Kconfig | 6 + sound/soc/codecs/Makefile | 2 + sound/soc/codecs/rt5668.c | 2639 ++++++++++++++++++++ sound/soc/codecs/rt5668.h | 1318 ++++++++++ 6 files changed, 4055 insertions(+) create mode 100644 Documentation/devicetree/bindings/sound/rt5668.txt create mode 100644 include/sound/rt5668.h create mode 100644 sound/soc/codecs/rt5668.c create mode 100644 sound/soc/codecs/rt5668.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/sound/rt5668.txt b/Documentation/devicetree/bindings/sound/rt5668.txt new file mode 100644 index 000000000000..c88b96e7764b --- /dev/null +++ b/Documentation/devicetree/bindings/sound/rt5668.txt @@ -0,0 +1,50 @@ +RT5668B audio CODEC + +This device supports I2C only. + +Required properties: + +- compatible : "realtek,rt5668b" + +- reg : The I2C address of the device. + +Optional properties: + +- interrupts : The CODEC's interrupt output. + +- realtek,dmic1-data-pin + 0: dmic1 is not used + 1: using GPIO2 pin as dmic1 data pin + 2: using GPIO5 pin as dmic1 data pin + +- realtek,dmic1-clk-pin + 0: using GPIO1 pin as dmic1 clock pin + 1: using GPIO3 pin as dmic1 clock pin + +- realtek,jd-src + 0: No JD is used + 1: using JD1 as JD source + +- realtek,ldo1-en-gpios : The GPIO that controls the CODEC's LDO1_EN pin. 
+ +Pins on the device (for linking into audio routes) for RT5668B: + + * DMIC L1 + * DMIC R1 + * IN1P + * HPOL + * HPOR + +Example: + +rt5668 { + compatible = "realtek,rt5668b"; + reg = <0x1a>; + interrupt-parent = <&gpio>; + interrupts = ; + realtek,ldo1-en-gpios = + <&gpio TEGRA_GPIO(R, 2) GPIO_ACTIVE_HIGH>; + realtek,dmic1-data-pin = <1>; + realtek,dmic1-clk-pin = <1>; + realtek,jd-src = <1>; +}; diff --git a/include/sound/rt5668.h b/include/sound/rt5668.h new file mode 100644 index 000000000000..f907b78696cf --- /dev/null +++ b/include/sound/rt5668.h @@ -0,0 +1,40 @@ +/* + * linux/sound/rt5668.h -- Platform data for RT5668 + * + * Copyright 2018 Realtek Microelectronics + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __LINUX_SND_RT5668_H +#define __LINUX_SND_RT5668_H + +enum rt5668_dmic1_data_pin { + RT5668_DMIC1_NULL, + RT5668_DMIC1_DATA_GPIO2, + RT5668_DMIC1_DATA_GPIO5, +}; + +enum rt5668_dmic1_clk_pin { + RT5668_DMIC1_CLK_GPIO1, + RT5668_DMIC1_CLK_GPIO3, +}; + +enum rt5668_jd_src { + RT5668_JD_NULL, + RT5668_JD1, +}; + +struct rt5668_platform_data { + + int ldo1_en; /* GPIO for LDO1_EN */ + + enum rt5668_dmic1_data_pin dmic1_data_pin; + enum rt5668_dmic1_clk_pin dmic1_clk_pin; + enum rt5668_jd_src jd_src; +}; + +#endif + diff --git a/sound/soc/codecs/Kconfig b/sound/soc/codecs/Kconfig index 665edb5b77ff..251e67f180fe 100644 --- a/sound/soc/codecs/Kconfig +++ b/sound/soc/codecs/Kconfig @@ -137,6 +137,7 @@ config SND_SOC_ALL_CODECS select SND_SOC_RT5660 if I2C select SND_SOC_RT5663 if I2C select SND_SOC_RT5665 if I2C + select SND_SOC_RT5668 if I2C select SND_SOC_RT5670 if I2C select SND_SOC_RT5677 if I2C && SPI_MASTER select SND_SOC_SGTL5000 if I2C @@ -771,6 +772,7 @@ config SND_SOC_RL6231 default y if SND_SOC_RT5660=y default y if SND_SOC_RT5663=y default y if SND_SOC_RT5665=y + default y if SND_SOC_RT5668=y default y if SND_SOC_RT5670=y default y if SND_SOC_RT5677=y default y if SND_SOC_RT1305=y @@ -783,6 +785,7 @@ config SND_SOC_RL6231 default m if SND_SOC_RT5660=m default m if SND_SOC_RT5663=m default m if SND_SOC_RT5665=m + default m if SND_SOC_RT5668=m default m if SND_SOC_RT5670=m default m if SND_SOC_RT5677=m default m if SND_SOC_RT1305=m @@ -850,6 +853,9 @@ config SND_SOC_RT5663 config SND_SOC_RT5665 tristate +config SND_SOC_RT5668 + tristate + config SND_SOC_RT5670 tristate diff --git a/sound/soc/codecs/Makefile b/sound/soc/codecs/Makefile index cccd7749e319..d3b73021a401 100644 --- a/sound/soc/codecs/Makefile +++ b/sound/soc/codecs/Makefile @@ -141,6 +141,7 @@ snd-soc-rt5659-objs := rt5659.o snd-soc-rt5660-objs := rt5660.o snd-soc-rt5663-objs := rt5663.o snd-soc-rt5665-objs := rt5665.o +snd-soc-rt5668-objs := rt5668.o snd-soc-rt5670-objs := rt5670.o snd-soc-rt5677-objs := rt5677.o snd-soc-rt5677-spi-objs := rt5677-spi.o @@ -396,6 +397,7 @@ obj-$(CONFIG_SND_SOC_RT5659) += snd-soc-rt5659.o obj-$(CONFIG_SND_SOC_RT5660) += snd-soc-rt5660.o obj-$(CONFIG_SND_SOC_RT5663) += snd-soc-rt5663.o obj-$(CONFIG_SND_SOC_RT5665) += snd-soc-rt5665.o +obj-$(CONFIG_SND_SOC_RT5668) += snd-soc-rt5668.o obj-$(CONFIG_SND_SOC_RT5670) += snd-soc-rt5670.o obj-$(CONFIG_SND_SOC_RT5677) += snd-soc-rt5677.o obj-$(CONFIG_SND_SOC_RT5677_SPI) += snd-soc-rt5677-spi.o diff --git a/sound/soc/codecs/rt5668.c b/sound/soc/codecs/rt5668.c new file mode 100644 index 000000000000..52a343f96eb2 --- /dev/null +++ b/sound/soc/codecs/rt5668.c @@ -0,0 +1,2639 @@ 
+/* + * rt5668.c -- RT5668B ALSA SoC audio component driver + * + * Copyright 2018 Realtek Semiconductor Corp. + * Author: Bard Liao + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rl6231.h" +#include "rt5668.h" + +#define RT5668_NUM_SUPPLIES 3 + +static const char *rt5668_supply_names[RT5668_NUM_SUPPLIES] = { + "AVDD", + "MICVDD", + "VBAT", +}; + +struct rt5668_priv { + struct snd_soc_component *component; + struct rt5668_platform_data pdata; + struct regmap *regmap; + struct snd_soc_jack *hs_jack; + struct regulator_bulk_data supplies[RT5668_NUM_SUPPLIES]; + struct delayed_work jack_detect_work; + struct delayed_work jd_check_work; + struct mutex calibrate_mutex; + + int sysclk; + int sysclk_src; + int lrck[RT5668_AIFS]; + int bclk[RT5668_AIFS]; + int master[RT5668_AIFS]; + + int pll_src; + int pll_in; + int pll_out; + + int jack_type; +}; + +static const struct reg_default rt5668_reg[] = { + {0x0002, 0x8080}, + {0x0003, 0x8000}, + {0x0005, 0x0000}, + {0x0006, 0x0000}, + {0x0008, 0x800f}, + {0x000b, 0x0000}, + {0x0010, 0x4040}, + {0x0011, 0x0000}, + {0x0012, 0x1404}, + {0x0013, 0x1000}, + {0x0014, 0xa00a}, + {0x0015, 0x0404}, + {0x0016, 0x0404}, + {0x0019, 0xafaf}, + {0x001c, 0x2f2f}, + {0x001f, 0x0000}, + {0x0022, 0x5757}, + {0x0023, 0x0039}, + {0x0024, 0x000b}, + {0x0026, 0xc0c4}, + {0x0029, 0x8080}, + {0x002a, 0xa0a0}, + {0x002b, 0x0300}, + {0x0030, 0x0000}, + {0x003c, 0x0080}, + {0x0044, 0x0c0c}, + {0x0049, 0x0000}, + {0x0061, 0x0000}, + {0x0062, 0x0000}, + {0x0063, 0x003f}, + {0x0064, 0x0000}, + {0x0065, 0x0000}, + {0x0066, 0x0030}, + {0x0067, 0x0000}, + {0x006b, 0x0000}, + {0x006c, 0x0000}, + {0x006d, 0x2200}, + {0x006e, 0x0a10}, + {0x0070, 0x8000}, + {0x0071, 0x8000}, + {0x0073, 0x0000}, + {0x0074, 0x0000}, + {0x0075, 0x0002}, + {0x0076, 0x0001}, + {0x0079, 0x0000}, + {0x007a, 0x0000}, + {0x007b, 0x0000}, + {0x007c, 0x0100}, + {0x007e, 0x0000}, + {0x0080, 0x0000}, + {0x0081, 0x0000}, + {0x0082, 0x0000}, + {0x0083, 0x0000}, + {0x0084, 0x0000}, + {0x0085, 0x0000}, + {0x0086, 0x0005}, + {0x0087, 0x0000}, + {0x0088, 0x0000}, + {0x008c, 0x0003}, + {0x008d, 0x0000}, + {0x008e, 0x0060}, + {0x008f, 0x1000}, + {0x0091, 0x0c26}, + {0x0092, 0x0073}, + {0x0093, 0x0000}, + {0x0094, 0x0080}, + {0x0098, 0x0000}, + {0x009a, 0x0000}, + {0x009b, 0x0000}, + {0x009c, 0x0000}, + {0x009d, 0x0000}, + {0x009e, 0x100c}, + {0x009f, 0x0000}, + {0x00a0, 0x0000}, + {0x00a3, 0x0002}, + {0x00a4, 0x0001}, + {0x00ae, 0x2040}, + {0x00af, 0x0000}, + {0x00b6, 0x0000}, + {0x00b7, 0x0000}, + {0x00b8, 0x0000}, + {0x00b9, 0x0002}, + {0x00be, 0x0000}, + {0x00c0, 0x0160}, + {0x00c1, 0x82a0}, + {0x00c2, 0x0000}, + {0x00d0, 0x0000}, + {0x00d1, 0x2244}, + {0x00d2, 0x3300}, + {0x00d3, 0x2200}, + {0x00d4, 0x0000}, + {0x00d9, 0x0009}, + {0x00da, 0x0000}, + {0x00db, 0x0000}, + {0x00dc, 0x00c0}, + {0x00dd, 0x2220}, + {0x00de, 0x3131}, + {0x00df, 0x3131}, + {0x00e0, 0x3131}, + {0x00e2, 0x0000}, + {0x00e3, 0x4000}, + {0x00e4, 0x0aa0}, + {0x00e5, 0x3131}, + {0x00e6, 0x3131}, + {0x00e7, 0x3131}, + {0x00e8, 0x3131}, + {0x00ea, 0xb320}, + {0x00eb, 0x0000}, + {0x00f0, 0x0000}, + {0x00f1, 0x00d0}, + {0x00f2, 0x00d0}, + {0x00f6, 0x0000}, + {0x00fa, 0x0000}, + 
{0x00fb, 0x0000}, + {0x00fc, 0x0000}, + {0x00fd, 0x0000}, + {0x00fe, 0x10ec}, + {0x00ff, 0x6530}, + {0x0100, 0xa0a0}, + {0x010b, 0x0000}, + {0x010c, 0xae00}, + {0x010d, 0xaaa0}, + {0x010e, 0x8aa2}, + {0x010f, 0x02a2}, + {0x0110, 0xc000}, + {0x0111, 0x04a2}, + {0x0112, 0x2800}, + {0x0113, 0x0000}, + {0x0117, 0x0100}, + {0x0125, 0x0410}, + {0x0132, 0x6026}, + {0x0136, 0x5555}, + {0x0138, 0x3700}, + {0x013a, 0x2000}, + {0x013b, 0x2000}, + {0x013c, 0x2005}, + {0x013f, 0x0000}, + {0x0142, 0x0000}, + {0x0145, 0x0002}, + {0x0146, 0x0000}, + {0x0147, 0x0000}, + {0x0148, 0x0000}, + {0x0149, 0x0000}, + {0x0150, 0x79a1}, + {0x0151, 0x0000}, + {0x0160, 0x4ec0}, + {0x0161, 0x0080}, + {0x0162, 0x0200}, + {0x0163, 0x0800}, + {0x0164, 0x0000}, + {0x0165, 0x0000}, + {0x0166, 0x0000}, + {0x0167, 0x000f}, + {0x0168, 0x000f}, + {0x0169, 0x0021}, + {0x0190, 0x413d}, + {0x0194, 0x0000}, + {0x0195, 0x0000}, + {0x0197, 0x0022}, + {0x0198, 0x0000}, + {0x0199, 0x0000}, + {0x01af, 0x0000}, + {0x01b0, 0x0400}, + {0x01b1, 0x0000}, + {0x01b2, 0x0000}, + {0x01b3, 0x0000}, + {0x01b4, 0x0000}, + {0x01b5, 0x0000}, + {0x01b6, 0x01c3}, + {0x01b7, 0x02a0}, + {0x01b8, 0x03e9}, + {0x01b9, 0x1389}, + {0x01ba, 0xc351}, + {0x01bb, 0x0009}, + {0x01bc, 0x0018}, + {0x01bd, 0x002a}, + {0x01be, 0x004c}, + {0x01bf, 0x0097}, + {0x01c0, 0x433d}, + {0x01c1, 0x2800}, + {0x01c2, 0x0000}, + {0x01c3, 0x0000}, + {0x01c4, 0x0000}, + {0x01c5, 0x0000}, + {0x01c6, 0x0000}, + {0x01c7, 0x0000}, + {0x01c8, 0x40af}, + {0x01c9, 0x0702}, + {0x01ca, 0x0000}, + {0x01cb, 0x0000}, + {0x01cc, 0x5757}, + {0x01cd, 0x5757}, + {0x01ce, 0x5757}, + {0x01cf, 0x5757}, + {0x01d0, 0x5757}, + {0x01d1, 0x5757}, + {0x01d2, 0x5757}, + {0x01d3, 0x5757}, + {0x01d4, 0x5757}, + {0x01d5, 0x5757}, + {0x01d6, 0x0000}, + {0x01d7, 0x0008}, + {0x01d8, 0x0029}, + {0x01d9, 0x3333}, + {0x01da, 0x0000}, + {0x01db, 0x0004}, + {0x01dc, 0x0000}, + {0x01de, 0x7c00}, + {0x01df, 0x0320}, + {0x01e0, 0x06a1}, + {0x01e1, 0x0000}, + {0x01e2, 0x0000}, + {0x01e3, 0x0000}, + {0x01e4, 0x0000}, + {0x01e6, 0x0001}, + {0x01e7, 0x0000}, + {0x01e8, 0x0000}, + {0x01ea, 0x0000}, + {0x01eb, 0x0000}, + {0x01ec, 0x0000}, + {0x01ed, 0x0000}, + {0x01ee, 0x0000}, + {0x01ef, 0x0000}, + {0x01f0, 0x0000}, + {0x01f1, 0x0000}, + {0x01f2, 0x0000}, + {0x01f3, 0x0000}, + {0x01f4, 0x0000}, + {0x0210, 0x6297}, + {0x0211, 0xa005}, + {0x0212, 0x824c}, + {0x0213, 0xf7ff}, + {0x0214, 0xf24c}, + {0x0215, 0x0102}, + {0x0216, 0x00a3}, + {0x0217, 0x0048}, + {0x0218, 0xa2c0}, + {0x0219, 0x0400}, + {0x021a, 0x00c8}, + {0x021b, 0x00c0}, + {0x021c, 0x0000}, + {0x0250, 0x4500}, + {0x0251, 0x40b3}, + {0x0252, 0x0000}, + {0x0253, 0x0000}, + {0x0254, 0x0000}, + {0x0255, 0x0000}, + {0x0256, 0x0000}, + {0x0257, 0x0000}, + {0x0258, 0x0000}, + {0x0259, 0x0000}, + {0x025a, 0x0005}, + {0x0270, 0x0000}, + {0x02ff, 0x0110}, + {0x0300, 0x001f}, + {0x0301, 0x032c}, + {0x0302, 0x5f21}, + {0x0303, 0x4000}, + {0x0304, 0x4000}, + {0x0305, 0x06d5}, + {0x0306, 0x8000}, + {0x0307, 0x0700}, + {0x0310, 0x4560}, + {0x0311, 0xa4a8}, + {0x0312, 0x7418}, + {0x0313, 0x0000}, + {0x0314, 0x0006}, + {0x0315, 0xffff}, + {0x0316, 0xc400}, + {0x0317, 0x0000}, + {0x03c0, 0x7e00}, + {0x03c1, 0x8000}, + {0x03c2, 0x8000}, + {0x03c3, 0x8000}, + {0x03c4, 0x8000}, + {0x03c5, 0x8000}, + {0x03c6, 0x8000}, + {0x03c7, 0x8000}, + {0x03c8, 0x8000}, + {0x03c9, 0x8000}, + {0x03ca, 0x8000}, + {0x03cb, 0x8000}, + {0x03cc, 0x8000}, + {0x03d0, 0x0000}, + {0x03d1, 0x0000}, + {0x03d2, 0x0000}, + {0x03d3, 0x0000}, + {0x03d4, 0x2000}, + {0x03d5, 0x2000}, + {0x03d6, 0x0000}, + {0x03d7, 
0x0000}, + {0x03d8, 0x2000}, + {0x03d9, 0x2000}, + {0x03da, 0x2000}, + {0x03db, 0x2000}, + {0x03dc, 0x0000}, + {0x03dd, 0x0000}, + {0x03de, 0x0000}, + {0x03df, 0x2000}, + {0x03e0, 0x0000}, + {0x03e1, 0x0000}, + {0x03e2, 0x0000}, + {0x03e3, 0x0000}, + {0x03e4, 0x0000}, + {0x03e5, 0x0000}, + {0x03e6, 0x0000}, + {0x03e7, 0x0000}, + {0x03e8, 0x0000}, + {0x03e9, 0x0000}, + {0x03ea, 0x0000}, + {0x03eb, 0x0000}, + {0x03ec, 0x0000}, + {0x03ed, 0x0000}, + {0x03ee, 0x0000}, + {0x03ef, 0x0000}, + {0x03f0, 0x0800}, + {0x03f1, 0x0800}, + {0x03f2, 0x0800}, + {0x03f3, 0x0800}, +}; + +static bool rt5668_volatile_register(struct device *dev, unsigned int reg) +{ + switch (reg) { + case RT5668_RESET: + case RT5668_CBJ_CTRL_2: + case RT5668_INT_ST_1: + case RT5668_4BTN_IL_CMD_1: + case RT5668_AJD1_CTRL: + case RT5668_HP_CALIB_CTRL_1: + case RT5668_DEVICE_ID: + case RT5668_I2C_MODE: + case RT5668_HP_CALIB_CTRL_10: + case RT5668_EFUSE_CTRL_2: + case RT5668_JD_TOP_VC_VTRL: + case RT5668_HP_IMP_SENS_CTRL_19: + case RT5668_IL_CMD_1: + case RT5668_SAR_IL_CMD_2: + case RT5668_SAR_IL_CMD_4: + case RT5668_SAR_IL_CMD_10: + case RT5668_SAR_IL_CMD_11: + case RT5668_EFUSE_CTRL_6...RT5668_EFUSE_CTRL_11: + case RT5668_HP_CALIB_STA_1...RT5668_HP_CALIB_STA_11: + return true; + default: + return false; + } +} + +static bool rt5668_readable_register(struct device *dev, unsigned int reg) +{ + switch (reg) { + case RT5668_RESET: + case RT5668_VERSION_ID: + case RT5668_VENDOR_ID: + case RT5668_DEVICE_ID: + case RT5668_HP_CTRL_1: + case RT5668_HP_CTRL_2: + case RT5668_HPL_GAIN: + case RT5668_HPR_GAIN: + case RT5668_I2C_CTRL: + case RT5668_CBJ_BST_CTRL: + case RT5668_CBJ_CTRL_1: + case RT5668_CBJ_CTRL_2: + case RT5668_CBJ_CTRL_3: + case RT5668_CBJ_CTRL_4: + case RT5668_CBJ_CTRL_5: + case RT5668_CBJ_CTRL_6: + case RT5668_CBJ_CTRL_7: + case RT5668_DAC1_DIG_VOL: + case RT5668_STO1_ADC_DIG_VOL: + case RT5668_STO1_ADC_BOOST: + case RT5668_HP_IMP_GAIN_1: + case RT5668_HP_IMP_GAIN_2: + case RT5668_SIDETONE_CTRL: + case RT5668_STO1_ADC_MIXER: + case RT5668_AD_DA_MIXER: + case RT5668_STO1_DAC_MIXER: + case RT5668_A_DAC1_MUX: + case RT5668_DIG_INF2_DATA: + case RT5668_REC_MIXER: + case RT5668_CAL_REC: + case RT5668_ALC_BACK_GAIN: + case RT5668_PWR_DIG_1: + case RT5668_PWR_DIG_2: + case RT5668_PWR_ANLG_1: + case RT5668_PWR_ANLG_2: + case RT5668_PWR_ANLG_3: + case RT5668_PWR_MIXER: + case RT5668_PWR_VOL: + case RT5668_CLK_DET: + case RT5668_RESET_LPF_CTRL: + case RT5668_RESET_HPF_CTRL: + case RT5668_DMIC_CTRL_1: + case RT5668_I2S1_SDP: + case RT5668_I2S2_SDP: + case RT5668_ADDA_CLK_1: + case RT5668_ADDA_CLK_2: + case RT5668_I2S1_F_DIV_CTRL_1: + case RT5668_I2S1_F_DIV_CTRL_2: + case RT5668_TDM_CTRL: + case RT5668_TDM_ADDA_CTRL_1: + case RT5668_TDM_ADDA_CTRL_2: + case RT5668_DATA_SEL_CTRL_1: + case RT5668_TDM_TCON_CTRL: + case RT5668_GLB_CLK: + case RT5668_PLL_CTRL_1: + case RT5668_PLL_CTRL_2: + case RT5668_PLL_TRACK_1: + case RT5668_PLL_TRACK_2: + case RT5668_PLL_TRACK_3: + case RT5668_PLL_TRACK_4: + case RT5668_PLL_TRACK_5: + case RT5668_PLL_TRACK_6: + case RT5668_PLL_TRACK_11: + case RT5668_SDW_REF_CLK: + case RT5668_DEPOP_1: + case RT5668_DEPOP_2: + case RT5668_HP_CHARGE_PUMP_1: + case RT5668_HP_CHARGE_PUMP_2: + case RT5668_MICBIAS_1: + case RT5668_MICBIAS_2: + case RT5668_PLL_TRACK_12: + case RT5668_PLL_TRACK_14: + case RT5668_PLL2_CTRL_1: + case RT5668_PLL2_CTRL_2: + case RT5668_PLL2_CTRL_3: + case RT5668_PLL2_CTRL_4: + case RT5668_RC_CLK_CTRL: + case RT5668_I2S_M_CLK_CTRL_1: + case RT5668_I2S2_F_DIV_CTRL_1: + case 
RT5668_I2S2_F_DIV_CTRL_2: + case RT5668_EQ_CTRL_1: + case RT5668_EQ_CTRL_2: + case RT5668_IRQ_CTRL_1: + case RT5668_IRQ_CTRL_2: + case RT5668_IRQ_CTRL_3: + case RT5668_IRQ_CTRL_4: + case RT5668_INT_ST_1: + case RT5668_GPIO_CTRL_1: + case RT5668_GPIO_CTRL_2: + case RT5668_GPIO_CTRL_3: + case RT5668_HP_AMP_DET_CTRL_1: + case RT5668_HP_AMP_DET_CTRL_2: + case RT5668_MID_HP_AMP_DET: + case RT5668_LOW_HP_AMP_DET: + case RT5668_DELAY_BUF_CTRL: + case RT5668_SV_ZCD_1: + case RT5668_SV_ZCD_2: + case RT5668_IL_CMD_1: + case RT5668_IL_CMD_2: + case RT5668_IL_CMD_3: + case RT5668_IL_CMD_4: + case RT5668_IL_CMD_5: + case RT5668_IL_CMD_6: + case RT5668_4BTN_IL_CMD_1: + case RT5668_4BTN_IL_CMD_2: + case RT5668_4BTN_IL_CMD_3: + case RT5668_4BTN_IL_CMD_4: + case RT5668_4BTN_IL_CMD_5: + case RT5668_4BTN_IL_CMD_6: + case RT5668_4BTN_IL_CMD_7: + case RT5668_ADC_STO1_HP_CTRL_1: + case RT5668_ADC_STO1_HP_CTRL_2: + case RT5668_AJD1_CTRL: + case RT5668_JD1_THD: + case RT5668_JD2_THD: + case RT5668_JD_CTRL_1: + case RT5668_DUMMY_1: + case RT5668_DUMMY_2: + case RT5668_DUMMY_3: + case RT5668_DAC_ADC_DIG_VOL1: + case RT5668_BIAS_CUR_CTRL_2: + case RT5668_BIAS_CUR_CTRL_3: + case RT5668_BIAS_CUR_CTRL_4: + case RT5668_BIAS_CUR_CTRL_5: + case RT5668_BIAS_CUR_CTRL_6: + case RT5668_BIAS_CUR_CTRL_7: + case RT5668_BIAS_CUR_CTRL_8: + case RT5668_BIAS_CUR_CTRL_9: + case RT5668_BIAS_CUR_CTRL_10: + case RT5668_VREF_REC_OP_FB_CAP_CTRL: + case RT5668_CHARGE_PUMP_1: + case RT5668_DIG_IN_CTRL_1: + case RT5668_PAD_DRIVING_CTRL: + case RT5668_SOFT_RAMP_DEPOP: + case RT5668_CHOP_DAC: + case RT5668_CHOP_ADC: + case RT5668_CALIB_ADC_CTRL: + case RT5668_VOL_TEST: + case RT5668_SPKVDD_DET_STA: + case RT5668_TEST_MODE_CTRL_1: + case RT5668_TEST_MODE_CTRL_2: + case RT5668_TEST_MODE_CTRL_3: + case RT5668_TEST_MODE_CTRL_4: + case RT5668_TEST_MODE_CTRL_5: + case RT5668_PLL1_INTERNAL: + case RT5668_PLL2_INTERNAL: + case RT5668_STO_NG2_CTRL_1: + case RT5668_STO_NG2_CTRL_2: + case RT5668_STO_NG2_CTRL_3: + case RT5668_STO_NG2_CTRL_4: + case RT5668_STO_NG2_CTRL_5: + case RT5668_STO_NG2_CTRL_6: + case RT5668_STO_NG2_CTRL_7: + case RT5668_STO_NG2_CTRL_8: + case RT5668_STO_NG2_CTRL_9: + case RT5668_STO_NG2_CTRL_10: + case RT5668_STO1_DAC_SIL_DET: + case RT5668_SIL_PSV_CTRL1: + case RT5668_SIL_PSV_CTRL2: + case RT5668_SIL_PSV_CTRL3: + case RT5668_SIL_PSV_CTRL4: + case RT5668_SIL_PSV_CTRL5: + case RT5668_HP_IMP_SENS_CTRL_01: + case RT5668_HP_IMP_SENS_CTRL_02: + case RT5668_HP_IMP_SENS_CTRL_03: + case RT5668_HP_IMP_SENS_CTRL_04: + case RT5668_HP_IMP_SENS_CTRL_05: + case RT5668_HP_IMP_SENS_CTRL_06: + case RT5668_HP_IMP_SENS_CTRL_07: + case RT5668_HP_IMP_SENS_CTRL_08: + case RT5668_HP_IMP_SENS_CTRL_09: + case RT5668_HP_IMP_SENS_CTRL_10: + case RT5668_HP_IMP_SENS_CTRL_11: + case RT5668_HP_IMP_SENS_CTRL_12: + case RT5668_HP_IMP_SENS_CTRL_13: + case RT5668_HP_IMP_SENS_CTRL_14: + case RT5668_HP_IMP_SENS_CTRL_15: + case RT5668_HP_IMP_SENS_CTRL_16: + case RT5668_HP_IMP_SENS_CTRL_17: + case RT5668_HP_IMP_SENS_CTRL_18: + case RT5668_HP_IMP_SENS_CTRL_19: + case RT5668_HP_IMP_SENS_CTRL_20: + case RT5668_HP_IMP_SENS_CTRL_21: + case RT5668_HP_IMP_SENS_CTRL_22: + case RT5668_HP_IMP_SENS_CTRL_23: + case RT5668_HP_IMP_SENS_CTRL_24: + case RT5668_HP_IMP_SENS_CTRL_25: + case RT5668_HP_IMP_SENS_CTRL_26: + case RT5668_HP_IMP_SENS_CTRL_27: + case RT5668_HP_IMP_SENS_CTRL_28: + case RT5668_HP_IMP_SENS_CTRL_29: + case RT5668_HP_IMP_SENS_CTRL_30: + case RT5668_HP_IMP_SENS_CTRL_31: + case RT5668_HP_IMP_SENS_CTRL_32: + case RT5668_HP_IMP_SENS_CTRL_33: + case 
RT5668_HP_IMP_SENS_CTRL_34: + case RT5668_HP_IMP_SENS_CTRL_35: + case RT5668_HP_IMP_SENS_CTRL_36: + case RT5668_HP_IMP_SENS_CTRL_37: + case RT5668_HP_IMP_SENS_CTRL_38: + case RT5668_HP_IMP_SENS_CTRL_39: + case RT5668_HP_IMP_SENS_CTRL_40: + case RT5668_HP_IMP_SENS_CTRL_41: + case RT5668_HP_IMP_SENS_CTRL_42: + case RT5668_HP_IMP_SENS_CTRL_43: + case RT5668_HP_LOGIC_CTRL_1: + case RT5668_HP_LOGIC_CTRL_2: + case RT5668_HP_LOGIC_CTRL_3: + case RT5668_HP_CALIB_CTRL_1: + case RT5668_HP_CALIB_CTRL_2: + case RT5668_HP_CALIB_CTRL_3: + case RT5668_HP_CALIB_CTRL_4: + case RT5668_HP_CALIB_CTRL_5: + case RT5668_HP_CALIB_CTRL_6: + case RT5668_HP_CALIB_CTRL_7: + case RT5668_HP_CALIB_CTRL_9: + case RT5668_HP_CALIB_CTRL_10: + case RT5668_HP_CALIB_CTRL_11: + case RT5668_HP_CALIB_STA_1: + case RT5668_HP_CALIB_STA_2: + case RT5668_HP_CALIB_STA_3: + case RT5668_HP_CALIB_STA_4: + case RT5668_HP_CALIB_STA_5: + case RT5668_HP_CALIB_STA_6: + case RT5668_HP_CALIB_STA_7: + case RT5668_HP_CALIB_STA_8: + case RT5668_HP_CALIB_STA_9: + case RT5668_HP_CALIB_STA_10: + case RT5668_HP_CALIB_STA_11: + case RT5668_SAR_IL_CMD_1: + case RT5668_SAR_IL_CMD_2: + case RT5668_SAR_IL_CMD_3: + case RT5668_SAR_IL_CMD_4: + case RT5668_SAR_IL_CMD_5: + case RT5668_SAR_IL_CMD_6: + case RT5668_SAR_IL_CMD_7: + case RT5668_SAR_IL_CMD_8: + case RT5668_SAR_IL_CMD_9: + case RT5668_SAR_IL_CMD_10: + case RT5668_SAR_IL_CMD_11: + case RT5668_SAR_IL_CMD_12: + case RT5668_SAR_IL_CMD_13: + case RT5668_EFUSE_CTRL_1: + case RT5668_EFUSE_CTRL_2: + case RT5668_EFUSE_CTRL_3: + case RT5668_EFUSE_CTRL_4: + case RT5668_EFUSE_CTRL_5: + case RT5668_EFUSE_CTRL_6: + case RT5668_EFUSE_CTRL_7: + case RT5668_EFUSE_CTRL_8: + case RT5668_EFUSE_CTRL_9: + case RT5668_EFUSE_CTRL_10: + case RT5668_EFUSE_CTRL_11: + case RT5668_JD_TOP_VC_VTRL: + case RT5668_DRC1_CTRL_0: + case RT5668_DRC1_CTRL_1: + case RT5668_DRC1_CTRL_2: + case RT5668_DRC1_CTRL_3: + case RT5668_DRC1_CTRL_4: + case RT5668_DRC1_CTRL_5: + case RT5668_DRC1_CTRL_6: + case RT5668_DRC1_HARD_LMT_CTRL_1: + case RT5668_DRC1_HARD_LMT_CTRL_2: + case RT5668_DRC1_PRIV_1: + case RT5668_DRC1_PRIV_2: + case RT5668_DRC1_PRIV_3: + case RT5668_DRC1_PRIV_4: + case RT5668_DRC1_PRIV_5: + case RT5668_DRC1_PRIV_6: + case RT5668_DRC1_PRIV_7: + case RT5668_DRC1_PRIV_8: + case RT5668_EQ_AUTO_RCV_CTRL1: + case RT5668_EQ_AUTO_RCV_CTRL2: + case RT5668_EQ_AUTO_RCV_CTRL3: + case RT5668_EQ_AUTO_RCV_CTRL4: + case RT5668_EQ_AUTO_RCV_CTRL5: + case RT5668_EQ_AUTO_RCV_CTRL6: + case RT5668_EQ_AUTO_RCV_CTRL7: + case RT5668_EQ_AUTO_RCV_CTRL8: + case RT5668_EQ_AUTO_RCV_CTRL9: + case RT5668_EQ_AUTO_RCV_CTRL10: + case RT5668_EQ_AUTO_RCV_CTRL11: + case RT5668_EQ_AUTO_RCV_CTRL12: + case RT5668_EQ_AUTO_RCV_CTRL13: + case RT5668_ADC_L_EQ_LPF1_A1: + case RT5668_R_EQ_LPF1_A1: + case RT5668_L_EQ_LPF1_H0: + case RT5668_R_EQ_LPF1_H0: + case RT5668_L_EQ_BPF1_A1: + case RT5668_R_EQ_BPF1_A1: + case RT5668_L_EQ_BPF1_A2: + case RT5668_R_EQ_BPF1_A2: + case RT5668_L_EQ_BPF1_H0: + case RT5668_R_EQ_BPF1_H0: + case RT5668_L_EQ_BPF2_A1: + case RT5668_R_EQ_BPF2_A1: + case RT5668_L_EQ_BPF2_A2: + case RT5668_R_EQ_BPF2_A2: + case RT5668_L_EQ_BPF2_H0: + case RT5668_R_EQ_BPF2_H0: + case RT5668_L_EQ_BPF3_A1: + case RT5668_R_EQ_BPF3_A1: + case RT5668_L_EQ_BPF3_A2: + case RT5668_R_EQ_BPF3_A2: + case RT5668_L_EQ_BPF3_H0: + case RT5668_R_EQ_BPF3_H0: + case RT5668_L_EQ_BPF4_A1: + case RT5668_R_EQ_BPF4_A1: + case RT5668_L_EQ_BPF4_A2: + case RT5668_R_EQ_BPF4_A2: + case RT5668_L_EQ_BPF4_H0: + case RT5668_R_EQ_BPF4_H0: + case RT5668_L_EQ_HPF1_A1: + case RT5668_R_EQ_HPF1_A1: + case 
RT5668_L_EQ_HPF1_H0: + case RT5668_R_EQ_HPF1_H0: + case RT5668_L_EQ_PRE_VOL: + case RT5668_R_EQ_PRE_VOL: + case RT5668_L_EQ_POST_VOL: + case RT5668_R_EQ_POST_VOL: + case RT5668_I2C_MODE: + return true; + default: + return false; + } +} + +static const DECLARE_TLV_DB_SCALE(hp_vol_tlv, -2250, 150, 0); +static const DECLARE_TLV_DB_SCALE(dac_vol_tlv, -65625, 375, 0); +static const DECLARE_TLV_DB_SCALE(adc_vol_tlv, -17625, 375, 0); +static const DECLARE_TLV_DB_SCALE(adc_bst_tlv, 0, 1200, 0); + +/* {0, +20, +24, +30, +35, +40, +44, +50, +52} dB */ +static const DECLARE_TLV_DB_RANGE(bst_tlv, + 0, 0, TLV_DB_SCALE_ITEM(0, 0, 0), + 1, 1, TLV_DB_SCALE_ITEM(2000, 0, 0), + 2, 2, TLV_DB_SCALE_ITEM(2400, 0, 0), + 3, 5, TLV_DB_SCALE_ITEM(3000, 500, 0), + 6, 6, TLV_DB_SCALE_ITEM(4400, 0, 0), + 7, 7, TLV_DB_SCALE_ITEM(5000, 0, 0), + 8, 8, TLV_DB_SCALE_ITEM(5200, 0, 0) +); + +/* Interface data select */ +static const char * const rt5668_data_select[] = { + "L/R", "R/L", "L/L", "R/R" +}; + +static SOC_ENUM_SINGLE_DECL(rt5668_if2_adc_enum, + RT5668_DIG_INF2_DATA, RT5668_IF2_ADC_SEL_SFT, rt5668_data_select); + +static SOC_ENUM_SINGLE_DECL(rt5668_if1_01_adc_enum, + RT5668_TDM_ADDA_CTRL_1, RT5668_IF1_ADC1_SEL_SFT, rt5668_data_select); + +static SOC_ENUM_SINGLE_DECL(rt5668_if1_23_adc_enum, + RT5668_TDM_ADDA_CTRL_1, RT5668_IF1_ADC2_SEL_SFT, rt5668_data_select); + +static SOC_ENUM_SINGLE_DECL(rt5668_if1_45_adc_enum, + RT5668_TDM_ADDA_CTRL_1, RT5668_IF1_ADC3_SEL_SFT, rt5668_data_select); + +static SOC_ENUM_SINGLE_DECL(rt5668_if1_67_adc_enum, + RT5668_TDM_ADDA_CTRL_1, RT5668_IF1_ADC4_SEL_SFT, rt5668_data_select); + +static const struct snd_kcontrol_new rt5668_if2_adc_swap_mux = + SOC_DAPM_ENUM("IF2 ADC Swap Mux", rt5668_if2_adc_enum); + +static const struct snd_kcontrol_new rt5668_if1_01_adc_swap_mux = + SOC_DAPM_ENUM("IF1 01 ADC Swap Mux", rt5668_if1_01_adc_enum); + +static const struct snd_kcontrol_new rt5668_if1_23_adc_swap_mux = + SOC_DAPM_ENUM("IF1 23 ADC Swap Mux", rt5668_if1_23_adc_enum); + +static const struct snd_kcontrol_new rt5668_if1_45_adc_swap_mux = + SOC_DAPM_ENUM("IF1 45 ADC Swap Mux", rt5668_if1_45_adc_enum); + +static const struct snd_kcontrol_new rt5668_if1_67_adc_swap_mux = + SOC_DAPM_ENUM("IF1 67 ADC Swap Mux", rt5668_if1_67_adc_enum); + +static void rt5668_reset(struct regmap *regmap) +{ + regmap_write(regmap, RT5668_RESET, 0); + regmap_write(regmap, RT5668_I2C_MODE, 1); +} +/** + * rt5668_sel_asrc_clk_src - select ASRC clock source for a set of filters + * @component: SoC audio component device. + * @filter_mask: mask of filters. + * @clk_src: clock source + * + * The ASRC function is for asynchronous MCLK and LRCK. Also, since RT5668 can + * only support standard 32fs or 64fs i2s format, ASRC should be enabled to + * support special i2s clock format such as Intel's 100fs(100 * sampling rate). + * ASRC function will track i2s clock and generate a corresponding system clock + * for codec. This function provides an API to select the clock source for a + * set of filters specified by the mask. And the component driver will turn on + * ASRC for these filters if ASRC is selected as their clock source. 
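+ *
+ * Illustrative usage (not part of this patch): a machine driver that
+ * wants ASRC tracking I2S1 on both stereo1 paths, using the filter and
+ * source masks this function accepts, might call:
+ *
+ *	rt5668_sel_asrc_clk_src(component,
+ *				RT5668_DA_STEREO1_FILTER |
+ *				RT5668_AD_STEREO1_FILTER,
+ *				RT5668_CLK_SEL_I2S1_ASRC);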
+ */
+int rt5668_sel_asrc_clk_src(struct snd_soc_component *component,
+		unsigned int filter_mask, unsigned int clk_src)
+{
+	switch (clk_src) {
+	case RT5668_CLK_SEL_SYS:
+	case RT5668_CLK_SEL_I2S1_ASRC:
+	case RT5668_CLK_SEL_I2S2_ASRC:
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	if (filter_mask & RT5668_DA_STEREO1_FILTER) {
+		snd_soc_component_update_bits(component, RT5668_PLL_TRACK_2,
+			RT5668_FILTER_CLK_SEL_MASK,
+			clk_src << RT5668_FILTER_CLK_SEL_SFT);
+	}
+
+	if (filter_mask & RT5668_AD_STEREO1_FILTER) {
+		snd_soc_component_update_bits(component, RT5668_PLL_TRACK_3,
+			RT5668_FILTER_CLK_SEL_MASK,
+			clk_src << RT5668_FILTER_CLK_SEL_SFT);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(rt5668_sel_asrc_clk_src);
+
+static int rt5668_button_detect(struct snd_soc_component *component)
+{
+	int btn_type, val;
+
+	val = snd_soc_component_read32(component, RT5668_4BTN_IL_CMD_1);
+	btn_type = val & 0xfff0;
+	snd_soc_component_write(component, RT5668_4BTN_IL_CMD_1, val);
+	pr_debug("%s btn_type=%x\n", __func__, btn_type);
+
+	return btn_type;
+}
+
+static void rt5668_enable_push_button_irq(struct snd_soc_component *component,
+		bool enable)
+{
+	if (enable) {
+		snd_soc_component_update_bits(component, RT5668_SAR_IL_CMD_1,
+			RT5668_SAR_BUTT_DET_MASK, RT5668_SAR_BUTT_DET_EN);
+		snd_soc_component_update_bits(component, RT5668_SAR_IL_CMD_13,
+			RT5668_SAR_SOUR_MASK, RT5668_SAR_SOUR_BTN);
+		snd_soc_component_write(component, RT5668_IL_CMD_1, 0x0040);
+		snd_soc_component_update_bits(component, RT5668_4BTN_IL_CMD_2,
+			RT5668_4BTN_IL_MASK | RT5668_4BTN_IL_RST_MASK,
+			RT5668_4BTN_IL_EN | RT5668_4BTN_IL_NOR);
+		snd_soc_component_update_bits(component, RT5668_IRQ_CTRL_3,
+			RT5668_IL_IRQ_MASK, RT5668_IL_IRQ_EN);
+	} else {
+		snd_soc_component_update_bits(component, RT5668_IRQ_CTRL_3,
+			RT5668_IL_IRQ_MASK, RT5668_IL_IRQ_DIS);
+		snd_soc_component_update_bits(component, RT5668_SAR_IL_CMD_1,
+			RT5668_SAR_BUTT_DET_MASK, RT5668_SAR_BUTT_DET_DIS);
+		snd_soc_component_update_bits(component, RT5668_4BTN_IL_CMD_2,
+			RT5668_4BTN_IL_MASK, RT5668_4BTN_IL_DIS);
+		snd_soc_component_update_bits(component, RT5668_4BTN_IL_CMD_2,
+			RT5668_4BTN_IL_RST_MASK, RT5668_4BTN_IL_RST);
+		snd_soc_component_update_bits(component, RT5668_SAR_IL_CMD_13,
+			RT5668_SAR_SOUR_MASK, RT5668_SAR_SOUR_TYPE);
+	}
+}
+
+/**
+ * rt5668_headset_detect - Detect headset.
+ * @component: SoC audio component device.
+ * @jack_insert: Jack insert or not.
+ *
+ * Detect whether the plug is a headset (with a microphone) or a plain
+ * headphone after a jack has been inserted.
+ *
+ * Return: the detected jack type.
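+ *
+ * Note: the CBJ jack-type field is polled up to 50 times with a 10-15 ms
+ * sleep per attempt, so detection may take roughly half a second before
+ * an unrecognized type falls back to SND_JACK_HEADPHONE.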
+ */
+static int rt5668_headset_detect(struct snd_soc_component *component,
+		int jack_insert)
+{
+	struct rt5668_priv *rt5668 = snd_soc_component_get_drvdata(component);
+	struct snd_soc_dapm_context *dapm =
+		snd_soc_component_get_dapm(component);
+	unsigned int val, count;
+
+	if (jack_insert) {
+		snd_soc_dapm_force_enable_pin(dapm, "CBJ Power");
+		snd_soc_dapm_sync(dapm);
+		snd_soc_component_update_bits(component, RT5668_CBJ_CTRL_1,
+			RT5668_TRIG_JD_MASK, RT5668_TRIG_JD_HIGH);
+
+		count = 0;
+		val = snd_soc_component_read32(component, RT5668_CBJ_CTRL_2)
+			& RT5668_JACK_TYPE_MASK;
+		while (val == 0 && count < 50) {
+			usleep_range(10000, 15000);
+			val = snd_soc_component_read32(component,
+				RT5668_CBJ_CTRL_2) & RT5668_JACK_TYPE_MASK;
+			count++;
+		}
+
+		switch (val) {
+		case 0x1:
+		case 0x2:
+			rt5668->jack_type = SND_JACK_HEADSET;
+			rt5668_enable_push_button_irq(component, true);
+			break;
+		default:
+			rt5668->jack_type = SND_JACK_HEADPHONE;
+		}
+
+	} else {
+		rt5668_enable_push_button_irq(component, false);
+		snd_soc_component_update_bits(component, RT5668_CBJ_CTRL_1,
+			RT5668_TRIG_JD_MASK, RT5668_TRIG_JD_LOW);
+		snd_soc_dapm_disable_pin(dapm, "CBJ Power");
+		snd_soc_dapm_sync(dapm);
+
+		rt5668->jack_type = 0;
+	}
+
+	dev_dbg(component->dev, "jack_type = %d\n", rt5668->jack_type);
+	return rt5668->jack_type;
+}
+
+static irqreturn_t rt5668_irq(int irq, void *data)
+{
+	struct rt5668_priv *rt5668 = data;
+
+	mod_delayed_work(system_power_efficient_wq,
+		&rt5668->jack_detect_work, msecs_to_jiffies(250));
+
+	return IRQ_HANDLED;
+}
+
+static void rt5668_jd_check_handler(struct work_struct *work)
+{
+	struct rt5668_priv *rt5668 = container_of(work, struct rt5668_priv,
+		jd_check_work.work);
+
+	if (snd_soc_component_read32(rt5668->component, RT5668_AJD1_CTRL)
+		& RT5668_JDH_RS_MASK) {
+		/* jack out */
+		rt5668->jack_type = rt5668_headset_detect(rt5668->component, 0);
+
+		snd_soc_jack_report(rt5668->hs_jack, rt5668->jack_type,
+			SND_JACK_HEADSET |
+			SND_JACK_BTN_0 | SND_JACK_BTN_1 |
+			SND_JACK_BTN_2 | SND_JACK_BTN_3);
+	} else {
+		schedule_delayed_work(&rt5668->jd_check_work,
+			msecs_to_jiffies(500));
+	}
+}
+
+static int rt5668_set_jack_detect(struct snd_soc_component *component,
+	struct snd_soc_jack *hs_jack, void *data)
+{
+	struct rt5668_priv *rt5668 = snd_soc_component_get_drvdata(component);
+
+	switch (rt5668->pdata.jd_src) {
+	case RT5668_JD1:
+		snd_soc_component_update_bits(component, RT5668_CBJ_CTRL_2,
+			RT5668_EXT_JD_SRC, RT5668_EXT_JD_SRC_MANUAL);
+		snd_soc_component_write(component, RT5668_CBJ_CTRL_1, 0xd002);
+		snd_soc_component_update_bits(component, RT5668_CBJ_CTRL_3,
+			RT5668_CBJ_IN_BUF_EN, RT5668_CBJ_IN_BUF_EN);
+		snd_soc_component_update_bits(component, RT5668_SAR_IL_CMD_1,
+			RT5668_SAR_POW_MASK, RT5668_SAR_POW_EN);
+		regmap_update_bits(rt5668->regmap, RT5668_GPIO_CTRL_1,
+			RT5668_GP1_PIN_MASK, RT5668_GP1_PIN_IRQ);
+		regmap_update_bits(rt5668->regmap, RT5668_RC_CLK_CTRL,
+			RT5668_POW_IRQ | RT5668_POW_JDH |
+			RT5668_POW_ANA, RT5668_POW_IRQ |
+			RT5668_POW_JDH | RT5668_POW_ANA);
+		regmap_update_bits(rt5668->regmap, RT5668_PWR_ANLG_2,
+			RT5668_PWR_JDH | RT5668_PWR_JDL,
+			RT5668_PWR_JDH | RT5668_PWR_JDL);
+		regmap_update_bits(rt5668->regmap, RT5668_IRQ_CTRL_2,
+			RT5668_JD1_EN_MASK | RT5668_JD1_POL_MASK,
+			RT5668_JD1_EN | RT5668_JD1_POL_NOR);
+		mod_delayed_work(system_power_efficient_wq,
+			&rt5668->jack_detect_work, msecs_to_jiffies(250));
+		break;
+
+	case RT5668_JD_NULL:
+		regmap_update_bits(rt5668->regmap, RT5668_IRQ_CTRL_2,
+			RT5668_JD1_EN_MASK, RT5668_JD1_DIS);
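+		/*
+		 * No JD source is used here, so clear the JD power bits in
+		 * the RC clock control as well.
+		 */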
+		regmap_update_bits(rt5668->regmap, RT5668_RC_CLK_CTRL,
+			RT5668_POW_JDH | RT5668_POW_JDL, 0);
+		break;
+
+	default:
+		dev_warn(component->dev, "Wrong JD source\n");
+		break;
+	}
+
+	rt5668->hs_jack = hs_jack;
+
+	return 0;
+}
+
+static void rt5668_jack_detect_handler(struct work_struct *work)
+{
+	struct rt5668_priv *rt5668 =
+		container_of(work, struct rt5668_priv, jack_detect_work.work);
+	int val, btn_type;
+
+	while (!rt5668->component)
+		usleep_range(10000, 15000);
+
+	while (!rt5668->component->card->instantiated)
+		usleep_range(10000, 15000);
+
+	mutex_lock(&rt5668->calibrate_mutex);
+
+	val = snd_soc_component_read32(rt5668->component, RT5668_AJD1_CTRL)
+		& RT5668_JDH_RS_MASK;
+	if (!val) {
+		/* jack in */
+		if (rt5668->jack_type == 0) {
+			/* jack was out, report jack type */
+			rt5668->jack_type =
+				rt5668_headset_detect(rt5668->component, 1);
+		} else {
+			/* jack is already in, report button event */
+			rt5668->jack_type = SND_JACK_HEADSET;
+			btn_type = rt5668_button_detect(rt5668->component);
+			/*
+			 * rt5668 can report three kinds of button behavior:
+			 * one click, double click and hold. However, we
+			 * currently only report button pressed/released
+			 * events, so all three behaviors are treated as a
+			 * button press.
+			 */
+			switch (btn_type) {
+			case 0x8000:
+			case 0x4000:
+			case 0x2000:
+				rt5668->jack_type |= SND_JACK_BTN_0;
+				break;
+			case 0x1000:
+			case 0x0800:
+			case 0x0400:
+				rt5668->jack_type |= SND_JACK_BTN_1;
+				break;
+			case 0x0200:
+			case 0x0100:
+			case 0x0080:
+				rt5668->jack_type |= SND_JACK_BTN_2;
+				break;
+			case 0x0040:
+			case 0x0020:
+			case 0x0010:
+				rt5668->jack_type |= SND_JACK_BTN_3;
+				break;
+			case 0x0000: /* unpressed */
+				break;
+			default:
+				dev_err(rt5668->component->dev,
+					"Unexpected button code 0x%04x\n",
+					btn_type);
+				btn_type = 0;
+				break;
+			}
+		}
+	} else {
+		/* jack out */
+		rt5668->jack_type = rt5668_headset_detect(rt5668->component, 0);
+	}
+
+	snd_soc_jack_report(rt5668->hs_jack, rt5668->jack_type,
+		SND_JACK_HEADSET |
+		SND_JACK_BTN_0 | SND_JACK_BTN_1 |
+		SND_JACK_BTN_2 | SND_JACK_BTN_3);
+
+	if (rt5668->jack_type & (SND_JACK_BTN_0 | SND_JACK_BTN_1 |
+		SND_JACK_BTN_2 | SND_JACK_BTN_3))
+		schedule_delayed_work(&rt5668->jd_check_work, 0);
+	else
+		cancel_delayed_work_sync(&rt5668->jd_check_work);
+
+	mutex_unlock(&rt5668->calibrate_mutex);
+}
+
+static const struct snd_kcontrol_new rt5668_snd_controls[] = {
+	/* Headphone Output Volume */
+	SOC_DOUBLE_R_TLV("Headphone Playback Volume", RT5668_HPL_GAIN,
+		RT5668_HPR_GAIN, RT5668_G_HP_SFT, 15, 1, hp_vol_tlv),
+
+	/* DAC Digital Volume */
+	SOC_DOUBLE_TLV("DAC1 Playback Volume", RT5668_DAC1_DIG_VOL,
+		RT5668_L_VOL_SFT, RT5668_R_VOL_SFT, 175, 0, dac_vol_tlv),
+
+	/* IN Boost Volume */
+	SOC_SINGLE_TLV("CBJ Boost Volume", RT5668_CBJ_BST_CTRL,
+		RT5668_BST_CBJ_SFT, 8, 0, bst_tlv),
+
+	/* ADC Digital Volume Control */
+	SOC_DOUBLE("STO1 ADC Capture Switch", RT5668_STO1_ADC_DIG_VOL,
+		RT5668_L_MUTE_SFT, RT5668_R_MUTE_SFT, 1, 1),
+	SOC_DOUBLE_TLV("STO1 ADC Capture Volume", RT5668_STO1_ADC_DIG_VOL,
+		RT5668_L_VOL_SFT, RT5668_R_VOL_SFT, 127, 0, adc_vol_tlv),
+
+	/* ADC Boost Volume Control */
+	SOC_DOUBLE_TLV("STO1 ADC Boost Gain Volume", RT5668_STO1_ADC_BOOST,
+		RT5668_STO1_ADC_L_BST_SFT, RT5668_STO1_ADC_R_BST_SFT,
+		3, 0, adc_bst_tlv),
+};
+
+static int rt5668_div_sel(struct rt5668_priv *rt5668,
+			  int target, const int div[], int size)
+{
+	int i;
+
+	if (rt5668->sysclk < target) {
+		pr_err("sysclk rate %d is too low\n",
+			rt5668->sysclk);
+		return 0;
+	}
+
+	for (i = 0; i < size - 1; i++) {
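+		/*
+		 * Use the first divider for which target * div[i] matches
+		 * sysclk exactly; report an error once the next divider
+		 * would already overshoot sysclk.
+		 */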
pr_info("div[%d]=%d\n", i, div[i]); + if (target * div[i] == rt5668->sysclk) + return i; + if (target * div[i + 1] > rt5668->sysclk) { + pr_err("can't find div for sysclk %d\n", + rt5668->sysclk); + return i; + } + } + + if (target * div[i] < rt5668->sysclk) + pr_err("sysclk rate %d is too high\n", + rt5668->sysclk); + + return size - 1; + +} + +/** + * set_dmic_clk - Set parameter of dmic. + * + * @w: DAPM widget. + * @kcontrol: The kcontrol of this widget. + * @event: Event id. + * + * Choose dmic clock between 1MHz and 3MHz. + * It is better for clock to approximate 3MHz. + */ +static int set_dmic_clk(struct snd_soc_dapm_widget *w, + struct snd_kcontrol *kcontrol, int event) +{ + struct snd_soc_component *component = + snd_soc_dapm_to_component(w->dapm); + struct rt5668_priv *rt5668 = snd_soc_component_get_drvdata(component); + int idx = -EINVAL; + static const int div[] = {2, 4, 6, 8, 12, 16, 24, 32, 48, 64, 96, 128}; + + idx = rt5668_div_sel(rt5668, 1500000, div, ARRAY_SIZE(div)); + + snd_soc_component_update_bits(component, RT5668_DMIC_CTRL_1, + RT5668_DMIC_CLK_MASK, idx << RT5668_DMIC_CLK_SFT); + + return 0; +} + +static int set_filter_clk(struct snd_soc_dapm_widget *w, + struct snd_kcontrol *kcontrol, int event) +{ + struct snd_soc_component *component = + snd_soc_dapm_to_component(w->dapm); + struct rt5668_priv *rt5668 = snd_soc_component_get_drvdata(component); + int ref, val, reg, idx = -EINVAL; + static const int div[] = {1, 2, 3, 4, 6, 8, 12, 16, 24, 32, 48}; + + val = snd_soc_component_read32(component, RT5668_GPIO_CTRL_1) && + RT5668_GP4_PIN_MASK; + if (w->shift == RT5668_PWR_ADC_S1F_BIT && + val == RT5668_GP4_PIN_ADCDAT2) + ref = 256 * rt5668->lrck[RT5668_AIF2]; + else + ref = 256 * rt5668->lrck[RT5668_AIF1]; + + idx = rt5668_div_sel(rt5668, ref, div, ARRAY_SIZE(div)); + + if (w->shift == RT5668_PWR_ADC_S1F_BIT) + reg = RT5668_PLL_TRACK_3; + else + reg = RT5668_PLL_TRACK_2; + + snd_soc_component_update_bits(component, reg, + RT5668_FILTER_CLK_SEL_MASK, idx << RT5668_FILTER_CLK_SEL_SFT); + + return 0; +} + +static int is_sys_clk_from_pll1(struct snd_soc_dapm_widget *w, + struct snd_soc_dapm_widget *sink) +{ + unsigned int val; + struct snd_soc_component *component = + snd_soc_dapm_to_component(w->dapm); + + val = snd_soc_component_read32(component, RT5668_GLB_CLK); + val &= RT5668_SCLK_SRC_MASK; + if (val == RT5668_SCLK_SRC_PLL1) + return 1; + else + return 0; +} + +static int is_using_asrc(struct snd_soc_dapm_widget *w, + struct snd_soc_dapm_widget *sink) +{ + unsigned int reg, shift, val; + struct snd_soc_component *component = + snd_soc_dapm_to_component(w->dapm); + + switch (w->shift) { + case RT5668_ADC_STO1_ASRC_SFT: + reg = RT5668_PLL_TRACK_3; + shift = RT5668_FILTER_CLK_SEL_SFT; + break; + case RT5668_DAC_STO1_ASRC_SFT: + reg = RT5668_PLL_TRACK_2; + shift = RT5668_FILTER_CLK_SEL_SFT; + break; + default: + return 0; + } + + val = (snd_soc_component_read32(component, reg) >> shift) & 0xf; + switch (val) { + case RT5668_CLK_SEL_I2S1_ASRC: + case RT5668_CLK_SEL_I2S2_ASRC: + return 1; + default: + return 0; + } + +} + +/* Digital Mixer */ +static const struct snd_kcontrol_new rt5668_sto1_adc_l_mix[] = { + SOC_DAPM_SINGLE("ADC1 Switch", RT5668_STO1_ADC_MIXER, + RT5668_M_STO1_ADC_L1_SFT, 1, 1), + SOC_DAPM_SINGLE("ADC2 Switch", RT5668_STO1_ADC_MIXER, + RT5668_M_STO1_ADC_L2_SFT, 1, 1), +}; + +static const struct snd_kcontrol_new rt5668_sto1_adc_r_mix[] = { + SOC_DAPM_SINGLE("ADC1 Switch", RT5668_STO1_ADC_MIXER, + RT5668_M_STO1_ADC_R1_SFT, 1, 1), + SOC_DAPM_SINGLE("ADC2 
Switch", RT5668_STO1_ADC_MIXER, + RT5668_M_STO1_ADC_R2_SFT, 1, 1), +}; + +static const struct snd_kcontrol_new rt5668_dac_l_mix[] = { + SOC_DAPM_SINGLE("Stereo ADC Switch", RT5668_AD_DA_MIXER, + RT5668_M_ADCMIX_L_SFT, 1, 1), + SOC_DAPM_SINGLE("DAC1 Switch", RT5668_AD_DA_MIXER, + RT5668_M_DAC1_L_SFT, 1, 1), +}; + +static const struct snd_kcontrol_new rt5668_dac_r_mix[] = { + SOC_DAPM_SINGLE("Stereo ADC Switch", RT5668_AD_DA_MIXER, + RT5668_M_ADCMIX_R_SFT, 1, 1), + SOC_DAPM_SINGLE("DAC1 Switch", RT5668_AD_DA_MIXER, + RT5668_M_DAC1_R_SFT, 1, 1), +}; + +static const struct snd_kcontrol_new rt5668_sto1_dac_l_mix[] = { + SOC_DAPM_SINGLE("DAC L1 Switch", RT5668_STO1_DAC_MIXER, + RT5668_M_DAC_L1_STO_L_SFT, 1, 1), + SOC_DAPM_SINGLE("DAC R1 Switch", RT5668_STO1_DAC_MIXER, + RT5668_M_DAC_R1_STO_L_SFT, 1, 1), +}; + +static const struct snd_kcontrol_new rt5668_sto1_dac_r_mix[] = { + SOC_DAPM_SINGLE("DAC L1 Switch", RT5668_STO1_DAC_MIXER, + RT5668_M_DAC_L1_STO_R_SFT, 1, 1), + SOC_DAPM_SINGLE("DAC R1 Switch", RT5668_STO1_DAC_MIXER, + RT5668_M_DAC_R1_STO_R_SFT, 1, 1), +}; + +/* Analog Input Mixer */ +static const struct snd_kcontrol_new rt5668_rec1_l_mix[] = { + SOC_DAPM_SINGLE("CBJ Switch", RT5668_REC_MIXER, + RT5668_M_CBJ_RM1_L_SFT, 1, 1), +}; + +/* STO1 ADC1 Source */ +/* MX-26 [13] [5] */ +static const char * const rt5668_sto1_adc1_src[] = { + "DAC MIX", "ADC" +}; + +static SOC_ENUM_SINGLE_DECL( + rt5668_sto1_adc1l_enum, RT5668_STO1_ADC_MIXER, + RT5668_STO1_ADC1L_SRC_SFT, rt5668_sto1_adc1_src); + +static const struct snd_kcontrol_new rt5668_sto1_adc1l_mux = + SOC_DAPM_ENUM("Stereo1 ADC1L Source", rt5668_sto1_adc1l_enum); + +static SOC_ENUM_SINGLE_DECL( + rt5668_sto1_adc1r_enum, RT5668_STO1_ADC_MIXER, + RT5668_STO1_ADC1R_SRC_SFT, rt5668_sto1_adc1_src); + +static const struct snd_kcontrol_new rt5668_sto1_adc1r_mux = + SOC_DAPM_ENUM("Stereo1 ADC1L Source", rt5668_sto1_adc1r_enum); + +/* STO1 ADC Source */ +/* MX-26 [11:10] [3:2] */ +static const char * const rt5668_sto1_adc_src[] = { + "ADC1 L", "ADC1 R" +}; + +static SOC_ENUM_SINGLE_DECL( + rt5668_sto1_adcl_enum, RT5668_STO1_ADC_MIXER, + RT5668_STO1_ADCL_SRC_SFT, rt5668_sto1_adc_src); + +static const struct snd_kcontrol_new rt5668_sto1_adcl_mux = + SOC_DAPM_ENUM("Stereo1 ADCL Source", rt5668_sto1_adcl_enum); + +static SOC_ENUM_SINGLE_DECL( + rt5668_sto1_adcr_enum, RT5668_STO1_ADC_MIXER, + RT5668_STO1_ADCR_SRC_SFT, rt5668_sto1_adc_src); + +static const struct snd_kcontrol_new rt5668_sto1_adcr_mux = + SOC_DAPM_ENUM("Stereo1 ADCR Source", rt5668_sto1_adcr_enum); + +/* STO1 ADC2 Source */ +/* MX-26 [12] [4] */ +static const char * const rt5668_sto1_adc2_src[] = { + "DAC MIX", "DMIC" +}; + +static SOC_ENUM_SINGLE_DECL( + rt5668_sto1_adc2l_enum, RT5668_STO1_ADC_MIXER, + RT5668_STO1_ADC2L_SRC_SFT, rt5668_sto1_adc2_src); + +static const struct snd_kcontrol_new rt5668_sto1_adc2l_mux = + SOC_DAPM_ENUM("Stereo1 ADC2L Source", rt5668_sto1_adc2l_enum); + +static SOC_ENUM_SINGLE_DECL( + rt5668_sto1_adc2r_enum, RT5668_STO1_ADC_MIXER, + RT5668_STO1_ADC2R_SRC_SFT, rt5668_sto1_adc2_src); + +static const struct snd_kcontrol_new rt5668_sto1_adc2r_mux = + SOC_DAPM_ENUM("Stereo1 ADC2R Source", rt5668_sto1_adc2r_enum); + +/* MX-79 [6:4] I2S1 ADC data location */ +static const unsigned int rt5668_if1_adc_slot_values[] = { + 0, + 2, + 4, + 6, +}; + +static const char * const rt5668_if1_adc_slot_src[] = { + "Slot 0", "Slot 2", "Slot 4", "Slot 6" +}; + +static SOC_VALUE_ENUM_SINGLE_DECL(rt5668_if1_adc_slot_enum, + RT5668_TDM_CTRL, RT5668_TDM_ADC_LCA_SFT, RT5668_TDM_ADC_LCA_MASK, 
+	rt5668_if1_adc_slot_src, rt5668_if1_adc_slot_values);
+
+static const struct snd_kcontrol_new rt5668_if1_adc_slot_mux =
+	SOC_DAPM_ENUM("IF1 ADC Slot location", rt5668_if1_adc_slot_enum);
+
+/* Analog DAC L1 Source, Analog DAC R1 Source */
+/* MX-2B [4], MX-2B [0] */
+static const char * const rt5668_alg_dac1_src[] = {
+	"Stereo1 DAC Mixer", "DAC1"
+};
+
+static SOC_ENUM_SINGLE_DECL(
+	rt5668_alg_dac_l1_enum, RT5668_A_DAC1_MUX,
+	RT5668_A_DACL1_SFT, rt5668_alg_dac1_src);
+
+static const struct snd_kcontrol_new rt5668_alg_dac_l1_mux =
+	SOC_DAPM_ENUM("Analog DAC L1 Source", rt5668_alg_dac_l1_enum);
+
+static SOC_ENUM_SINGLE_DECL(
+	rt5668_alg_dac_r1_enum, RT5668_A_DAC1_MUX,
+	RT5668_A_DACR1_SFT, rt5668_alg_dac1_src);
+
+static const struct snd_kcontrol_new rt5668_alg_dac_r1_mux =
+	SOC_DAPM_ENUM("Analog DAC R1 Source", rt5668_alg_dac_r1_enum);
+
+/* Out Switch */
+static const struct snd_kcontrol_new hpol_switch =
+	SOC_DAPM_SINGLE_AUTODISABLE("Switch", RT5668_HP_CTRL_1,
+		RT5668_L_MUTE_SFT, 1, 1);
+static const struct snd_kcontrol_new hpor_switch =
+	SOC_DAPM_SINGLE_AUTODISABLE("Switch", RT5668_HP_CTRL_1,
+		RT5668_R_MUTE_SFT, 1, 1);
+
+static int rt5668_hp_event(struct snd_soc_dapm_widget *w,
+	struct snd_kcontrol *kcontrol, int event)
+{
+	struct snd_soc_component *component =
+		snd_soc_dapm_to_component(w->dapm);
+
+	switch (event) {
+	case SND_SOC_DAPM_PRE_PMU:
+		snd_soc_component_write(component,
+			RT5668_HP_LOGIC_CTRL_2, 0x0012);
+		snd_soc_component_write(component,
+			RT5668_HP_CTRL_2, 0x6000);
+		snd_soc_component_update_bits(component, RT5668_STO_NG2_CTRL_1,
+			RT5668_NG2_EN_MASK, RT5668_NG2_EN);
+		snd_soc_component_update_bits(component,
+			RT5668_DEPOP_1, 0x60, 0x60);
+		break;
+
+	case SND_SOC_DAPM_POST_PMD:
+		snd_soc_component_update_bits(component,
+			RT5668_DEPOP_1, 0x60, 0x0);
+		snd_soc_component_write(component,
+			RT5668_HP_CTRL_2, 0x0000);
+		break;
+
+	default:
+		return 0;
+	}
+
+	return 0;
+}
+
+static int set_dmic_power(struct snd_soc_dapm_widget *w,
+	struct snd_kcontrol *kcontrol, int event)
+{
+	switch (event) {
+	case SND_SOC_DAPM_POST_PMU:
+		/* Add a delay to avoid pop noise */
+		msleep(150);
+		break;
+
+	default:
+		return 0;
+	}
+
+	return 0;
+}
+
+static int rt5668_set_vref(struct snd_soc_dapm_widget *w,
+	struct snd_kcontrol *kcontrol, int event)
+{
+	struct snd_soc_component *component =
+		snd_soc_dapm_to_component(w->dapm);
+
+	switch (event) {
+	case SND_SOC_DAPM_PRE_PMU:
+		switch (w->shift) {
+		case RT5668_PWR_VREF1_BIT:
+			snd_soc_component_update_bits(component,
+				RT5668_PWR_ANLG_1, RT5668_PWR_FV1, 0);
+			break;
+
+		case RT5668_PWR_VREF2_BIT:
+			snd_soc_component_update_bits(component,
+				RT5668_PWR_ANLG_1, RT5668_PWR_FV2, 0);
+			break;
+
+		default:
+			break;
+		}
+		break;
+
+	case SND_SOC_DAPM_POST_PMU:
+		usleep_range(15000, 20000);
+		switch (w->shift) {
+		case RT5668_PWR_VREF1_BIT:
+			snd_soc_component_update_bits(component,
+				RT5668_PWR_ANLG_1, RT5668_PWR_FV1,
+				RT5668_PWR_FV1);
+			break;
+
+		case RT5668_PWR_VREF2_BIT:
+			snd_soc_component_update_bits(component,
+				RT5668_PWR_ANLG_1, RT5668_PWR_FV2,
+				RT5668_PWR_FV2);
+			break;
+
+		default:
+			break;
+		}
+		break;
+
+	default:
+		return 0;
+	}
+
+	return 0;
+}
+
+static const unsigned int rt5668_adcdat_pin_values[] = {
+	1,
+	3,
+};
+
+static const char * const rt5668_adcdat_pin_select[] = {
+	"ADCDAT1",
+	"ADCDAT2",
+};
+
+static SOC_VALUE_ENUM_SINGLE_DECL(rt5668_adcdat_pin_enum,
+	RT5668_GPIO_CTRL_1, RT5668_GP4_PIN_SFT, RT5668_GP4_PIN_MASK,
+	rt5668_adcdat_pin_select, rt5668_adcdat_pin_values);
+
+static const struct snd_kcontrol_new rt5668_adcdat_pin_ctrl =
+	SOC_DAPM_ENUM("ADCDAT", rt5668_adcdat_pin_enum);
+
+static const struct snd_soc_dapm_widget rt5668_dapm_widgets[] = {
+	SND_SOC_DAPM_SUPPLY("LDO2", RT5668_PWR_ANLG_3, RT5668_PWR_LDO2_BIT,
+		0, NULL, 0),
+	SND_SOC_DAPM_SUPPLY("PLL1", RT5668_PWR_ANLG_3, RT5668_PWR_PLL_BIT,
+		0, NULL, 0),
+	SND_SOC_DAPM_SUPPLY("PLL2B", RT5668_PWR_ANLG_3, RT5668_PWR_PLL2B_BIT,
+		0, NULL, 0),
+	SND_SOC_DAPM_SUPPLY("PLL2F", RT5668_PWR_ANLG_3, RT5668_PWR_PLL2F_BIT,
+		0, NULL, 0),
+	SND_SOC_DAPM_SUPPLY("Vref1", RT5668_PWR_ANLG_1, RT5668_PWR_VREF1_BIT, 0,
+		rt5668_set_vref, SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMU),
+	SND_SOC_DAPM_SUPPLY("Vref2", RT5668_PWR_ANLG_1, RT5668_PWR_VREF2_BIT, 0,
+		rt5668_set_vref, SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMU),
+
+	/* ASRC */
+	SND_SOC_DAPM_SUPPLY_S("DAC STO1 ASRC", 1, RT5668_PLL_TRACK_1,
+		RT5668_DAC_STO1_ASRC_SFT, 0, NULL, 0),
+	SND_SOC_DAPM_SUPPLY_S("ADC STO1 ASRC", 1, RT5668_PLL_TRACK_1,
+		RT5668_ADC_STO1_ASRC_SFT, 0, NULL, 0),
+	SND_SOC_DAPM_SUPPLY_S("AD ASRC", 1, RT5668_PLL_TRACK_1,
+		RT5668_AD_ASRC_SFT, 0, NULL, 0),
+	SND_SOC_DAPM_SUPPLY_S("DA ASRC", 1, RT5668_PLL_TRACK_1,
+		RT5668_DA_ASRC_SFT, 0, NULL, 0),
+	SND_SOC_DAPM_SUPPLY_S("DMIC ASRC", 1, RT5668_PLL_TRACK_1,
+		RT5668_DMIC_ASRC_SFT, 0, NULL, 0),
+
+	/* Input Side */
+	SND_SOC_DAPM_SUPPLY("MICBIAS1", RT5668_PWR_ANLG_2, RT5668_PWR_MB1_BIT,
+		0, NULL, 0),
+	SND_SOC_DAPM_SUPPLY("MICBIAS2", RT5668_PWR_ANLG_2, RT5668_PWR_MB2_BIT,
+		0, NULL, 0),
+
+	/* Input Lines */
+	SND_SOC_DAPM_INPUT("DMIC L1"),
+	SND_SOC_DAPM_INPUT("DMIC R1"),
+
+	SND_SOC_DAPM_INPUT("IN1P"),
+
+	SND_SOC_DAPM_SUPPLY("DMIC CLK", SND_SOC_NOPM, 0, 0,
+		set_dmic_clk, SND_SOC_DAPM_PRE_PMU),
+	SND_SOC_DAPM_SUPPLY("DMIC1 Power", RT5668_DMIC_CTRL_1,
+		RT5668_DMIC_1_EN_SFT, 0, set_dmic_power, SND_SOC_DAPM_POST_PMU),
+
+	/* Boost */
+	SND_SOC_DAPM_PGA("BST1 CBJ", SND_SOC_NOPM,
+		0, 0, NULL, 0),
+
+	SND_SOC_DAPM_SUPPLY("CBJ Power", RT5668_PWR_ANLG_3,
+		RT5668_PWR_CBJ_BIT, 0, NULL, 0),
+
+	/* REC Mixer */
+	SND_SOC_DAPM_MIXER("RECMIX1L", SND_SOC_NOPM, 0, 0, rt5668_rec1_l_mix,
+		ARRAY_SIZE(rt5668_rec1_l_mix)),
+	SND_SOC_DAPM_SUPPLY("RECMIX1L Power", RT5668_PWR_ANLG_2,
+		RT5668_PWR_RM1_L_BIT, 0, NULL, 0),
+
+	/* ADCs */
+	SND_SOC_DAPM_ADC("ADC1 L", NULL, SND_SOC_NOPM, 0, 0),
+	SND_SOC_DAPM_ADC("ADC1 R", NULL, SND_SOC_NOPM, 0, 0),
+
+	SND_SOC_DAPM_SUPPLY("ADC1 L Power", RT5668_PWR_DIG_1,
+		RT5668_PWR_ADC_L1_BIT, 0, NULL, 0),
+	SND_SOC_DAPM_SUPPLY("ADC1 R Power", RT5668_PWR_DIG_1,
+		RT5668_PWR_ADC_R1_BIT, 0, NULL, 0),
+	SND_SOC_DAPM_SUPPLY("ADC1 clock", RT5668_CHOP_ADC,
+		RT5668_CKGEN_ADC1_SFT, 0, NULL, 0),
+
+	/* ADC Mux */
+	SND_SOC_DAPM_MUX("Stereo1 ADC L1 Mux", SND_SOC_NOPM, 0, 0,
+		&rt5668_sto1_adc1l_mux),
+	SND_SOC_DAPM_MUX("Stereo1 ADC R1 Mux", SND_SOC_NOPM, 0, 0,
+		&rt5668_sto1_adc1r_mux),
+	SND_SOC_DAPM_MUX("Stereo1 ADC L2 Mux", SND_SOC_NOPM, 0, 0,
+		&rt5668_sto1_adc2l_mux),
+	SND_SOC_DAPM_MUX("Stereo1 ADC R2 Mux", SND_SOC_NOPM, 0, 0,
+		&rt5668_sto1_adc2r_mux),
+	SND_SOC_DAPM_MUX("Stereo1 ADC L Mux", SND_SOC_NOPM, 0, 0,
+		&rt5668_sto1_adcl_mux),
+	SND_SOC_DAPM_MUX("Stereo1 ADC R Mux", SND_SOC_NOPM, 0, 0,
+		&rt5668_sto1_adcr_mux),
+	SND_SOC_DAPM_MUX("IF1_ADC Mux", SND_SOC_NOPM, 0, 0,
+		&rt5668_if1_adc_slot_mux),
+
+	/* ADC Mixer */
+	SND_SOC_DAPM_SUPPLY("ADC Stereo1 Filter", RT5668_PWR_DIG_2,
+		RT5668_PWR_ADC_S1F_BIT, 0, set_filter_clk,
+		SND_SOC_DAPM_PRE_PMU),
+	SND_SOC_DAPM_MIXER("Stereo1 ADC MIXL", RT5668_STO1_ADC_DIG_VOL,
+		RT5668_L_MUTE_SFT, 1, rt5668_sto1_adc_l_mix, +
ARRAY_SIZE(rt5668_sto1_adc_l_mix)), + SND_SOC_DAPM_MIXER("Stereo1 ADC MIXR", RT5668_STO1_ADC_DIG_VOL, + RT5668_R_MUTE_SFT, 1, rt5668_sto1_adc_r_mix, + ARRAY_SIZE(rt5668_sto1_adc_r_mix)), + + /* ADC PGA */ + SND_SOC_DAPM_PGA("Stereo1 ADC MIX", SND_SOC_NOPM, 0, 0, NULL, 0), + + /* Digital Interface */ + SND_SOC_DAPM_SUPPLY("I2S1", RT5668_PWR_DIG_1, RT5668_PWR_I2S1_BIT, + 0, NULL, 0), + SND_SOC_DAPM_SUPPLY("I2S2", RT5668_PWR_DIG_1, RT5668_PWR_I2S2_BIT, + 0, NULL, 0), + SND_SOC_DAPM_PGA("IF1 DAC1", SND_SOC_NOPM, 0, 0, NULL, 0), + SND_SOC_DAPM_PGA("IF1 DAC1 L", SND_SOC_NOPM, 0, 0, NULL, 0), + SND_SOC_DAPM_PGA("IF1 DAC1 R", SND_SOC_NOPM, 0, 0, NULL, 0), + + /* Digital Interface Select */ + SND_SOC_DAPM_MUX("IF1 01 ADC Swap Mux", SND_SOC_NOPM, 0, 0, + &rt5668_if1_01_adc_swap_mux), + SND_SOC_DAPM_MUX("IF1 23 ADC Swap Mux", SND_SOC_NOPM, 0, 0, + &rt5668_if1_23_adc_swap_mux), + SND_SOC_DAPM_MUX("IF1 45 ADC Swap Mux", SND_SOC_NOPM, 0, 0, + &rt5668_if1_45_adc_swap_mux), + SND_SOC_DAPM_MUX("IF1 67 ADC Swap Mux", SND_SOC_NOPM, 0, 0, + &rt5668_if1_67_adc_swap_mux), + SND_SOC_DAPM_MUX("IF2 ADC Swap Mux", SND_SOC_NOPM, 0, 0, + &rt5668_if2_adc_swap_mux), + + SND_SOC_DAPM_MUX("ADCDAT Mux", SND_SOC_NOPM, 0, 0, + &rt5668_adcdat_pin_ctrl), + + /* Audio Interface */ + SND_SOC_DAPM_AIF_OUT("AIF1TX", "AIF1 Capture", 0, + RT5668_I2S1_SDP, RT5668_SEL_ADCDAT_SFT, 1), + SND_SOC_DAPM_AIF_OUT("AIF2TX", "AIF2 Capture", 0, + RT5668_I2S2_SDP, RT5668_I2S2_PIN_CFG_SFT, 1), + SND_SOC_DAPM_AIF_IN("AIF1RX", "AIF1 Playback", 0, SND_SOC_NOPM, 0, 0), + + /* Output Side */ + /* DAC mixer before sound effect */ + SND_SOC_DAPM_MIXER("DAC1 MIXL", SND_SOC_NOPM, 0, 0, + rt5668_dac_l_mix, ARRAY_SIZE(rt5668_dac_l_mix)), + SND_SOC_DAPM_MIXER("DAC1 MIXR", SND_SOC_NOPM, 0, 0, + rt5668_dac_r_mix, ARRAY_SIZE(rt5668_dac_r_mix)), + + /* DAC channel Mux */ + SND_SOC_DAPM_MUX("DAC L1 Source", SND_SOC_NOPM, 0, 0, + &rt5668_alg_dac_l1_mux), + SND_SOC_DAPM_MUX("DAC R1 Source", SND_SOC_NOPM, 0, 0, + &rt5668_alg_dac_r1_mux), + + /* DAC Mixer */ + SND_SOC_DAPM_SUPPLY("DAC Stereo1 Filter", RT5668_PWR_DIG_2, + RT5668_PWR_DAC_S1F_BIT, 0, set_filter_clk, + SND_SOC_DAPM_PRE_PMU), + SND_SOC_DAPM_MIXER("Stereo1 DAC MIXL", SND_SOC_NOPM, 0, 0, + rt5668_sto1_dac_l_mix, ARRAY_SIZE(rt5668_sto1_dac_l_mix)), + SND_SOC_DAPM_MIXER("Stereo1 DAC MIXR", SND_SOC_NOPM, 0, 0, + rt5668_sto1_dac_r_mix, ARRAY_SIZE(rt5668_sto1_dac_r_mix)), + + /* DACs */ + SND_SOC_DAPM_DAC("DAC L1", NULL, RT5668_PWR_DIG_1, + RT5668_PWR_DAC_L1_BIT, 0), + SND_SOC_DAPM_DAC("DAC R1", NULL, RT5668_PWR_DIG_1, + RT5668_PWR_DAC_R1_BIT, 0), + SND_SOC_DAPM_SUPPLY_S("DAC 1 Clock", 3, RT5668_CHOP_DAC, + RT5668_CKGEN_DAC1_SFT, 0, NULL, 0), + + /* HPO */ + SND_SOC_DAPM_PGA_S("HP Amp", 1, SND_SOC_NOPM, 0, 0, rt5668_hp_event, + SND_SOC_DAPM_POST_PMD | SND_SOC_DAPM_PRE_PMU), + + SND_SOC_DAPM_SUPPLY("HP Amp L", RT5668_PWR_ANLG_1, + RT5668_PWR_HA_L_BIT, 0, NULL, 0), + SND_SOC_DAPM_SUPPLY("HP Amp R", RT5668_PWR_ANLG_1, + RT5668_PWR_HA_R_BIT, 0, NULL, 0), + SND_SOC_DAPM_SUPPLY_S("Charge Pump", 1, RT5668_DEPOP_1, + RT5668_PUMP_EN_SFT, 0, NULL, 0), + SND_SOC_DAPM_SUPPLY_S("Capless", 2, RT5668_DEPOP_1, + RT5668_CAPLESS_EN_SFT, 0, NULL, 0), + + SND_SOC_DAPM_SWITCH("HPOL Playback", SND_SOC_NOPM, 0, 0, + &hpol_switch), + SND_SOC_DAPM_SWITCH("HPOR Playback", SND_SOC_NOPM, 0, 0, + &hpor_switch), + + /* CLK DET */ + SND_SOC_DAPM_SUPPLY("CLKDET SYS", RT5668_CLK_DET, + RT5668_SYS_CLK_DET_SFT, 0, NULL, 0), + SND_SOC_DAPM_SUPPLY("CLKDET PLL1", RT5668_CLK_DET, + RT5668_PLL1_CLK_DET_SFT, 0, NULL, 0), + SND_SOC_DAPM_SUPPLY("CLKDET 
PLL2", RT5668_CLK_DET, + RT5668_PLL2_CLK_DET_SFT, 0, NULL, 0), + SND_SOC_DAPM_SUPPLY("CLKDET", RT5668_CLK_DET, + RT5668_POW_CLK_DET_SFT, 0, NULL, 0), + + /* Output Lines */ + SND_SOC_DAPM_OUTPUT("HPOL"), + SND_SOC_DAPM_OUTPUT("HPOR"), + +}; + +static const struct snd_soc_dapm_route rt5668_dapm_routes[] = { + /*PLL*/ + {"ADC Stereo1 Filter", NULL, "PLL1", is_sys_clk_from_pll1}, + {"DAC Stereo1 Filter", NULL, "PLL1", is_sys_clk_from_pll1}, + + /*ASRC*/ + {"ADC Stereo1 Filter", NULL, "ADC STO1 ASRC", is_using_asrc}, + {"DAC Stereo1 Filter", NULL, "DAC STO1 ASRC", is_using_asrc}, + {"ADC STO1 ASRC", NULL, "AD ASRC"}, + {"DAC STO1 ASRC", NULL, "DA ASRC"}, + + /*Vref*/ + {"MICBIAS1", NULL, "Vref1"}, + {"MICBIAS1", NULL, "Vref2"}, + {"MICBIAS2", NULL, "Vref1"}, + {"MICBIAS2", NULL, "Vref2"}, + + {"CLKDET SYS", NULL, "CLKDET"}, + + {"IN1P", NULL, "LDO2"}, + + {"BST1 CBJ", NULL, "IN1P"}, + {"BST1 CBJ", NULL, "CBJ Power"}, + {"CBJ Power", NULL, "Vref2"}, + + {"RECMIX1L", "CBJ Switch", "BST1 CBJ"}, + {"RECMIX1L", NULL, "RECMIX1L Power"}, + + {"ADC1 L", NULL, "RECMIX1L"}, + {"ADC1 L", NULL, "ADC1 L Power"}, + {"ADC1 L", NULL, "ADC1 clock"}, + + {"DMIC L1", NULL, "DMIC CLK"}, + {"DMIC L1", NULL, "DMIC1 Power"}, + {"DMIC R1", NULL, "DMIC CLK"}, + {"DMIC R1", NULL, "DMIC1 Power"}, + {"DMIC CLK", NULL, "DMIC ASRC"}, + + {"Stereo1 ADC L Mux", "ADC1 L", "ADC1 L"}, + {"Stereo1 ADC L Mux", "ADC1 R", "ADC1 R"}, + {"Stereo1 ADC R Mux", "ADC1 L", "ADC1 L"}, + {"Stereo1 ADC R Mux", "ADC1 R", "ADC1 R"}, + + {"Stereo1 ADC L1 Mux", "ADC", "Stereo1 ADC L Mux"}, + {"Stereo1 ADC L1 Mux", "DAC MIX", "Stereo1 DAC MIXL"}, + {"Stereo1 ADC L2 Mux", "DMIC", "DMIC L1"}, + {"Stereo1 ADC L2 Mux", "DAC MIX", "Stereo1 DAC MIXL"}, + + {"Stereo1 ADC R1 Mux", "ADC", "Stereo1 ADC R Mux"}, + {"Stereo1 ADC R1 Mux", "DAC MIX", "Stereo1 DAC MIXR"}, + {"Stereo1 ADC R2 Mux", "DMIC", "DMIC R1"}, + {"Stereo1 ADC R2 Mux", "DAC MIX", "Stereo1 DAC MIXR"}, + + {"Stereo1 ADC MIXL", "ADC1 Switch", "Stereo1 ADC L1 Mux"}, + {"Stereo1 ADC MIXL", "ADC2 Switch", "Stereo1 ADC L2 Mux"}, + {"Stereo1 ADC MIXL", NULL, "ADC Stereo1 Filter"}, + + {"Stereo1 ADC MIXR", "ADC1 Switch", "Stereo1 ADC R1 Mux"}, + {"Stereo1 ADC MIXR", "ADC2 Switch", "Stereo1 ADC R2 Mux"}, + {"Stereo1 ADC MIXR", NULL, "ADC Stereo1 Filter"}, + + {"Stereo1 ADC MIX", NULL, "Stereo1 ADC MIXL"}, + {"Stereo1 ADC MIX", NULL, "Stereo1 ADC MIXR"}, + + {"IF1 01 ADC Swap Mux", "L/R", "Stereo1 ADC MIX"}, + {"IF1 01 ADC Swap Mux", "L/L", "Stereo1 ADC MIX"}, + {"IF1 01 ADC Swap Mux", "R/L", "Stereo1 ADC MIX"}, + {"IF1 01 ADC Swap Mux", "R/R", "Stereo1 ADC MIX"}, + {"IF1 23 ADC Swap Mux", "L/R", "Stereo1 ADC MIX"}, + {"IF1 23 ADC Swap Mux", "R/L", "Stereo1 ADC MIX"}, + {"IF1 23 ADC Swap Mux", "L/L", "Stereo1 ADC MIX"}, + {"IF1 23 ADC Swap Mux", "R/R", "Stereo1 ADC MIX"}, + {"IF1 45 ADC Swap Mux", "L/R", "Stereo1 ADC MIX"}, + {"IF1 45 ADC Swap Mux", "R/L", "Stereo1 ADC MIX"}, + {"IF1 45 ADC Swap Mux", "L/L", "Stereo1 ADC MIX"}, + {"IF1 45 ADC Swap Mux", "R/R", "Stereo1 ADC MIX"}, + {"IF1 67 ADC Swap Mux", "L/R", "Stereo1 ADC MIX"}, + {"IF1 67 ADC Swap Mux", "R/L", "Stereo1 ADC MIX"}, + {"IF1 67 ADC Swap Mux", "L/L", "Stereo1 ADC MIX"}, + {"IF1 67 ADC Swap Mux", "R/R", "Stereo1 ADC MIX"}, + + {"IF1_ADC Mux", "Slot 0", "IF1 01 ADC Swap Mux"}, + {"IF1_ADC Mux", "Slot 2", "IF1 23 ADC Swap Mux"}, + {"IF1_ADC Mux", "Slot 4", "IF1 45 ADC Swap Mux"}, + {"IF1_ADC Mux", "Slot 6", "IF1 67 ADC Swap Mux"}, + {"IF1_ADC Mux", NULL, "I2S1"}, + {"ADCDAT Mux", "ADCDAT1", "IF1_ADC Mux"}, + {"AIF1TX", NULL, "ADCDAT 
Mux"}, + {"IF2 ADC Swap Mux", "L/R", "Stereo1 ADC MIX"}, + {"IF2 ADC Swap Mux", "R/L", "Stereo1 ADC MIX"}, + {"IF2 ADC Swap Mux", "L/L", "Stereo1 ADC MIX"}, + {"IF2 ADC Swap Mux", "R/R", "Stereo1 ADC MIX"}, + {"ADCDAT Mux", "ADCDAT2", "IF2 ADC Swap Mux"}, + {"AIF2TX", NULL, "ADCDAT Mux"}, + + {"IF1 DAC1 L", NULL, "AIF1RX"}, + {"IF1 DAC1 L", NULL, "I2S1"}, + {"IF1 DAC1 L", NULL, "DAC Stereo1 Filter"}, + {"IF1 DAC1 R", NULL, "AIF1RX"}, + {"IF1 DAC1 R", NULL, "I2S1"}, + {"IF1 DAC1 R", NULL, "DAC Stereo1 Filter"}, + + {"DAC1 MIXL", "Stereo ADC Switch", "Stereo1 ADC MIXL"}, + {"DAC1 MIXL", "DAC1 Switch", "IF1 DAC1 L"}, + {"DAC1 MIXR", "Stereo ADC Switch", "Stereo1 ADC MIXR"}, + {"DAC1 MIXR", "DAC1 Switch", "IF1 DAC1 R"}, + + {"Stereo1 DAC MIXL", "DAC L1 Switch", "DAC1 MIXL"}, + {"Stereo1 DAC MIXL", "DAC R1 Switch", "DAC1 MIXR"}, + + {"Stereo1 DAC MIXR", "DAC R1 Switch", "DAC1 MIXR"}, + {"Stereo1 DAC MIXR", "DAC L1 Switch", "DAC1 MIXL"}, + + {"DAC L1 Source", "DAC1", "DAC1 MIXL"}, + {"DAC L1 Source", "Stereo1 DAC Mixer", "Stereo1 DAC MIXL"}, + {"DAC R1 Source", "DAC1", "DAC1 MIXR"}, + {"DAC R1 Source", "Stereo1 DAC Mixer", "Stereo1 DAC MIXR"}, + + {"DAC L1", NULL, "DAC L1 Source"}, + {"DAC R1", NULL, "DAC R1 Source"}, + + {"DAC L1", NULL, "DAC 1 Clock"}, + {"DAC R1", NULL, "DAC 1 Clock"}, + + {"HP Amp", NULL, "DAC L1"}, + {"HP Amp", NULL, "DAC R1"}, + {"HP Amp", NULL, "HP Amp L"}, + {"HP Amp", NULL, "HP Amp R"}, + {"HP Amp", NULL, "Capless"}, + {"HP Amp", NULL, "Charge Pump"}, + {"HP Amp", NULL, "CLKDET SYS"}, + {"HP Amp", NULL, "CBJ Power"}, + {"HP Amp", NULL, "Vref2"}, + {"HPOL Playback", "Switch", "HP Amp"}, + {"HPOR Playback", "Switch", "HP Amp"}, + {"HPOL", NULL, "HPOL Playback"}, + {"HPOR", NULL, "HPOR Playback"}, +}; + +static int rt5668_set_tdm_slot(struct snd_soc_dai *dai, unsigned int tx_mask, + unsigned int rx_mask, int slots, int slot_width) +{ + struct snd_soc_component *component = dai->component; + unsigned int val = 0; + + switch (slots) { + case 4: + val |= RT5668_TDM_TX_CH_4; + val |= RT5668_TDM_RX_CH_4; + break; + case 6: + val |= RT5668_TDM_TX_CH_6; + val |= RT5668_TDM_RX_CH_6; + break; + case 8: + val |= RT5668_TDM_TX_CH_8; + val |= RT5668_TDM_RX_CH_8; + break; + case 2: + break; + default: + return -EINVAL; + } + + snd_soc_component_update_bits(component, RT5668_TDM_CTRL, + RT5668_TDM_TX_CH_MASK | RT5668_TDM_RX_CH_MASK, val); + + switch (slot_width) { + case 16: + val = RT5668_TDM_CL_16; + break; + case 20: + val = RT5668_TDM_CL_20; + break; + case 24: + val = RT5668_TDM_CL_24; + break; + case 32: + val = RT5668_TDM_CL_32; + break; + default: + return -EINVAL; + } + + snd_soc_component_update_bits(component, RT5668_TDM_TCON_CTRL, + RT5668_TDM_CL_MASK, val); + + return 0; +} + + +static int rt5668_hw_params(struct snd_pcm_substream *substream, + struct snd_pcm_hw_params *params, struct snd_soc_dai *dai) +{ + struct snd_soc_component *component = dai->component; + struct rt5668_priv *rt5668 = snd_soc_component_get_drvdata(component); + unsigned int len_1 = 0, len_2 = 0; + int pre_div, frame_size; + + rt5668->lrck[dai->id] = params_rate(params); + pre_div = rl6231_get_clk_info(rt5668->sysclk, rt5668->lrck[dai->id]); + + frame_size = snd_soc_params_to_frame_size(params); + if (frame_size < 0) { + dev_err(component->dev, "Unsupported frame size: %d\n", + frame_size); + return -EINVAL; + } + + dev_dbg(dai->dev, "lrck is %dHz and pre_div is %d for iis %d\n", + rt5668->lrck[dai->id], pre_div, dai->id); + + switch (params_width(params)) { + case 16: + break; + case 20: + len_1 |= 
RT5668_I2S1_DL_20; + len_2 |= RT5668_I2S2_DL_20; + break; + case 24: + len_1 |= RT5668_I2S1_DL_24; + len_2 |= RT5668_I2S2_DL_24; + break; + case 32: + len_1 |= RT5668_I2S1_DL_32; + len_2 |= RT5668_I2S2_DL_24; + break; + case 8: + len_1 |= RT5668_I2S2_DL_8; + len_2 |= RT5668_I2S2_DL_8; + break; + default: + return -EINVAL; + } + + switch (dai->id) { + case RT5668_AIF1: + snd_soc_component_update_bits(component, RT5668_I2S1_SDP, + RT5668_I2S1_DL_MASK, len_1); + if (rt5668->master[RT5668_AIF1]) { + snd_soc_component_update_bits(component, + RT5668_ADDA_CLK_1, RT5668_I2S_M_DIV_MASK, + pre_div << RT5668_I2S_M_DIV_SFT); + } + if (params_channels(params) == 1) /* mono mode */ + snd_soc_component_update_bits(component, + RT5668_I2S1_SDP, RT5668_I2S1_MONO_MASK, + RT5668_I2S1_MONO_EN); + else + snd_soc_component_update_bits(component, + RT5668_I2S1_SDP, RT5668_I2S1_MONO_MASK, + RT5668_I2S1_MONO_DIS); + break; + case RT5668_AIF2: + snd_soc_component_update_bits(component, RT5668_I2S2_SDP, + RT5668_I2S2_DL_MASK, len_2); + if (rt5668->master[RT5668_AIF2]) { + snd_soc_component_update_bits(component, + RT5668_I2S_M_CLK_CTRL_1, RT5668_I2S2_M_PD_MASK, + pre_div << RT5668_I2S2_M_PD_SFT); + } + if (params_channels(params) == 1) /* mono mode */ + snd_soc_component_update_bits(component, + RT5668_I2S2_SDP, RT5668_I2S2_MONO_MASK, + RT5668_I2S2_MONO_EN); + else + snd_soc_component_update_bits(component, + RT5668_I2S2_SDP, RT5668_I2S2_MONO_MASK, + RT5668_I2S2_MONO_DIS); + break; + default: + dev_err(component->dev, "Invalid dai->id: %d\n", dai->id); + return -EINVAL; + } + + return 0; +} + +static int rt5668_set_dai_fmt(struct snd_soc_dai *dai, unsigned int fmt) +{ + struct snd_soc_component *component = dai->component; + struct rt5668_priv *rt5668 = snd_soc_component_get_drvdata(component); + unsigned int reg_val = 0, tdm_ctrl = 0; + + switch (fmt & SND_SOC_DAIFMT_MASTER_MASK) { + case SND_SOC_DAIFMT_CBM_CFM: + rt5668->master[dai->id] = 1; + break; + case SND_SOC_DAIFMT_CBS_CFS: + rt5668->master[dai->id] = 0; + break; + default: + return -EINVAL; + } + + switch (fmt & SND_SOC_DAIFMT_INV_MASK) { + case SND_SOC_DAIFMT_NB_NF: + break; + case SND_SOC_DAIFMT_IB_NF: + reg_val |= RT5668_I2S_BP_INV; + tdm_ctrl |= RT5668_TDM_S_BP_INV; + break; + case SND_SOC_DAIFMT_NB_IF: + if (dai->id == RT5668_AIF1) + tdm_ctrl |= RT5668_TDM_S_LP_INV | RT5668_TDM_M_BP_INV; + else + return -EINVAL; + break; + case SND_SOC_DAIFMT_IB_IF: + if (dai->id == RT5668_AIF1) + tdm_ctrl |= RT5668_TDM_S_BP_INV | RT5668_TDM_S_LP_INV | + RT5668_TDM_M_BP_INV | RT5668_TDM_M_LP_INV; + else + return -EINVAL; + break; + default: + return -EINVAL; + } + + switch (fmt & SND_SOC_DAIFMT_FORMAT_MASK) { + case SND_SOC_DAIFMT_I2S: + break; + case SND_SOC_DAIFMT_LEFT_J: + reg_val |= RT5668_I2S_DF_LEFT; + tdm_ctrl |= RT5668_TDM_DF_LEFT; + break; + case SND_SOC_DAIFMT_DSP_A: + reg_val |= RT5668_I2S_DF_PCM_A; + tdm_ctrl |= RT5668_TDM_DF_PCM_A; + break; + case SND_SOC_DAIFMT_DSP_B: + reg_val |= RT5668_I2S_DF_PCM_B; + tdm_ctrl |= RT5668_TDM_DF_PCM_B; + break; + default: + return -EINVAL; + } + + switch (dai->id) { + case RT5668_AIF1: + snd_soc_component_update_bits(component, RT5668_I2S1_SDP, + RT5668_I2S_DF_MASK, reg_val); + snd_soc_component_update_bits(component, RT5668_TDM_TCON_CTRL, + RT5668_TDM_MS_MASK | RT5668_TDM_S_BP_MASK | + RT5668_TDM_DF_MASK | RT5668_TDM_M_BP_MASK | + RT5668_TDM_M_LP_MASK | RT5668_TDM_S_LP_MASK, + tdm_ctrl | rt5668->master[dai->id]); + break; + case RT5668_AIF2: + if (rt5668->master[dai->id] == 0) + reg_val |= RT5668_I2S2_MS_S; + 
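+		/* I2S2 carries its master/slave selection in the SDP register */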
snd_soc_component_update_bits(component, RT5668_I2S2_SDP,
+			RT5668_I2S2_MS_MASK | RT5668_I2S_BP_MASK |
+			RT5668_I2S_DF_MASK, reg_val);
+		break;
+	default:
+		dev_err(component->dev, "Invalid dai->id: %d\n", dai->id);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int rt5668_set_component_sysclk(struct snd_soc_component *component,
+		int clk_id, int source, unsigned int freq, int dir)
+{
+	struct rt5668_priv *rt5668 = snd_soc_component_get_drvdata(component);
+	unsigned int reg_val = 0, src = 0;
+
+	if (freq == rt5668->sysclk && clk_id == rt5668->sysclk_src)
+		return 0;
+
+	switch (clk_id) {
+	case RT5668_SCLK_S_MCLK:
+		reg_val |= RT5668_SCLK_SRC_MCLK;
+		src = RT5668_CLK_SRC_MCLK;
+		break;
+	case RT5668_SCLK_S_PLL1:
+		reg_val |= RT5668_SCLK_SRC_PLL1;
+		src = RT5668_CLK_SRC_PLL1;
+		break;
+	case RT5668_SCLK_S_PLL2:
+		reg_val |= RT5668_SCLK_SRC_PLL2;
+		src = RT5668_CLK_SRC_PLL2;
+		break;
+	case RT5668_SCLK_S_RCCLK:
+		reg_val |= RT5668_SCLK_SRC_RCCLK;
+		src = RT5668_CLK_SRC_RCCLK;
+		break;
+	default:
+		dev_err(component->dev, "Invalid clock id (%d)\n", clk_id);
+		return -EINVAL;
+	}
+	snd_soc_component_update_bits(component, RT5668_GLB_CLK,
+		RT5668_SCLK_SRC_MASK, reg_val);
+
+	if (rt5668->master[RT5668_AIF2]) {
+		snd_soc_component_update_bits(component,
+			RT5668_I2S_M_CLK_CTRL_1, RT5668_I2S2_SRC_MASK,
+			src << RT5668_I2S2_SRC_SFT);
+	}
+
+	rt5668->sysclk = freq;
+	rt5668->sysclk_src = clk_id;
+
+	dev_dbg(component->dev, "Sysclk is %dHz and clock id is %d\n",
+		freq, clk_id);
+
+	return 0;
+}
+
+static int rt5668_set_component_pll(struct snd_soc_component *component,
+		int pll_id, int source, unsigned int freq_in,
+		unsigned int freq_out)
+{
+	struct rt5668_priv *rt5668 = snd_soc_component_get_drvdata(component);
+	struct rl6231_pll_code pll_code;
+	int ret;
+
+	if (source == rt5668->pll_src && freq_in == rt5668->pll_in &&
+	    freq_out == rt5668->pll_out)
+		return 0;
+
+	if (!freq_in || !freq_out) {
+		dev_dbg(component->dev, "PLL disabled\n");
+
+		rt5668->pll_in = 0;
+		rt5668->pll_out = 0;
+		snd_soc_component_update_bits(component, RT5668_GLB_CLK,
+			RT5668_SCLK_SRC_MASK, RT5668_SCLK_SRC_MCLK);
+		return 0;
+	}
+
+	switch (source) {
+	case RT5668_PLL1_S_MCLK:
+		snd_soc_component_update_bits(component, RT5668_GLB_CLK,
+			RT5668_PLL1_SRC_MASK, RT5668_PLL1_SRC_MCLK);
+		break;
+	case RT5668_PLL1_S_BCLK1:
+		snd_soc_component_update_bits(component, RT5668_GLB_CLK,
+			RT5668_PLL1_SRC_MASK, RT5668_PLL1_SRC_BCLK1);
+		break;
+	default:
+		dev_err(component->dev, "Unknown PLL Source %d\n", source);
+		return -EINVAL;
+	}
+
+	ret = rl6231_pll_calc(freq_in, freq_out, &pll_code);
+	if (ret < 0) {
+		dev_err(component->dev, "Unsupported input clock %d\n",
+			freq_in);
+		return ret;
+	}
+
+	dev_dbg(component->dev, "bypass=%d m=%d n=%d k=%d\n",
+		pll_code.m_bp, (pll_code.m_bp ? 0 : pll_code.m_code),
+		pll_code.n_code, pll_code.k_code);
+
+	snd_soc_component_write(component, RT5668_PLL_CTRL_1,
+		pll_code.n_code << RT5668_PLL_N_SFT | pll_code.k_code);
+	snd_soc_component_write(component, RT5668_PLL_CTRL_2,
+		(pll_code.m_bp ?
0 : pll_code.m_code) << RT5668_PLL_M_SFT | + pll_code.m_bp << RT5668_PLL_M_BP_SFT); + + rt5668->pll_in = freq_in; + rt5668->pll_out = freq_out; + rt5668->pll_src = source; + + return 0; +} + +static int rt5668_set_bclk_ratio(struct snd_soc_dai *dai, unsigned int ratio) +{ + struct snd_soc_component *component = dai->component; + struct rt5668_priv *rt5668 = snd_soc_component_get_drvdata(component); + + rt5668->bclk[dai->id] = ratio; + + switch (ratio) { + case 64: + snd_soc_component_update_bits(component, RT5668_ADDA_CLK_2, + RT5668_I2S2_BCLK_MS2_MASK, + RT5668_I2S2_BCLK_MS2_64); + break; + case 32: + snd_soc_component_update_bits(component, RT5668_ADDA_CLK_2, + RT5668_I2S2_BCLK_MS2_MASK, + RT5668_I2S2_BCLK_MS2_32); + break; + default: + dev_err(dai->dev, "Invalid bclk ratio %d\n", ratio); + return -EINVAL; + } + + return 0; +} + +static int rt5668_set_bias_level(struct snd_soc_component *component, + enum snd_soc_bias_level level) +{ + struct rt5668_priv *rt5668 = snd_soc_component_get_drvdata(component); + + switch (level) { + case SND_SOC_BIAS_PREPARE: + regmap_update_bits(rt5668->regmap, RT5668_PWR_ANLG_1, + RT5668_PWR_MB | RT5668_PWR_BG, + RT5668_PWR_MB | RT5668_PWR_BG); + regmap_update_bits(rt5668->regmap, RT5668_PWR_DIG_1, + RT5668_DIG_GATE_CTRL | RT5668_PWR_LDO, + RT5668_DIG_GATE_CTRL | RT5668_PWR_LDO); + break; + + case SND_SOC_BIAS_STANDBY: + regmap_update_bits(rt5668->regmap, RT5668_PWR_ANLG_1, + RT5668_PWR_MB, RT5668_PWR_MB); + regmap_update_bits(rt5668->regmap, RT5668_PWR_DIG_1, + RT5668_DIG_GATE_CTRL, RT5668_DIG_GATE_CTRL); + break; + case SND_SOC_BIAS_OFF: + regmap_update_bits(rt5668->regmap, RT5668_PWR_DIG_1, + RT5668_DIG_GATE_CTRL | RT5668_PWR_LDO, 0); + regmap_update_bits(rt5668->regmap, RT5668_PWR_ANLG_1, + RT5668_PWR_MB | RT5668_PWR_BG, 0); + break; + + default: + break; + } + + return 0; +} + +static int rt5668_probe(struct snd_soc_component *component) +{ + struct rt5668_priv *rt5668 = snd_soc_component_get_drvdata(component); + + rt5668->component = component; + + return 0; +} + +static void rt5668_remove(struct snd_soc_component *component) +{ + struct rt5668_priv *rt5668 = snd_soc_component_get_drvdata(component); + + rt5668_reset(rt5668->regmap); +} + +#ifdef CONFIG_PM +static int rt5668_suspend(struct snd_soc_component *component) +{ + struct rt5668_priv *rt5668 = snd_soc_component_get_drvdata(component); + + regcache_cache_only(rt5668->regmap, true); + regcache_mark_dirty(rt5668->regmap); + return 0; +} + +static int rt5668_resume(struct snd_soc_component *component) +{ + struct rt5668_priv *rt5668 = snd_soc_component_get_drvdata(component); + + regcache_cache_only(rt5668->regmap, false); + regcache_sync(rt5668->regmap); + + return 0; +} +#else +#define rt5668_suspend NULL +#define rt5668_resume NULL +#endif + +#define RT5668_STEREO_RATES SNDRV_PCM_RATE_8000_192000 +#define RT5668_FORMATS (SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S20_3LE | \ + SNDRV_PCM_FMTBIT_S24_LE | SNDRV_PCM_FMTBIT_S8) + +static const struct snd_soc_dai_ops rt5668_aif1_dai_ops = { + .hw_params = rt5668_hw_params, + .set_fmt = rt5668_set_dai_fmt, + .set_tdm_slot = rt5668_set_tdm_slot, +}; + +static const struct snd_soc_dai_ops rt5668_aif2_dai_ops = { + .hw_params = rt5668_hw_params, + .set_fmt = rt5668_set_dai_fmt, + .set_bclk_ratio = rt5668_set_bclk_ratio, +}; + +static struct snd_soc_dai_driver rt5668_dai[] = { + { + .name = "rt5668-aif1", + .id = RT5668_AIF1, + .playback = { + .stream_name = "AIF1 Playback", + .channels_min = 1, + .channels_max = 2, + .rates = RT5668_STEREO_RATES, + 
.formats = RT5668_FORMATS, + }, + .capture = { + .stream_name = "AIF1 Capture", + .channels_min = 1, + .channels_max = 2, + .rates = RT5668_STEREO_RATES, + .formats = RT5668_FORMATS, + }, + .ops = &rt5668_aif1_dai_ops, + }, + { + .name = "rt5668-aif2", + .id = RT5668_AIF2, + .capture = { + .stream_name = "AIF2 Capture", + .channels_min = 1, + .channels_max = 2, + .rates = RT5668_STEREO_RATES, + .formats = RT5668_FORMATS, + }, + .ops = &rt5668_aif2_dai_ops, + }, +}; + +static const struct snd_soc_component_driver soc_component_dev_rt5668 = { + .probe = rt5668_probe, + .remove = rt5668_remove, + .suspend = rt5668_suspend, + .resume = rt5668_resume, + .set_bias_level = rt5668_set_bias_level, + .controls = rt5668_snd_controls, + .num_controls = ARRAY_SIZE(rt5668_snd_controls), + .dapm_widgets = rt5668_dapm_widgets, + .num_dapm_widgets = ARRAY_SIZE(rt5668_dapm_widgets), + .dapm_routes = rt5668_dapm_routes, + .num_dapm_routes = ARRAY_SIZE(rt5668_dapm_routes), + .set_sysclk = rt5668_set_component_sysclk, + .set_pll = rt5668_set_component_pll, + .set_jack = rt5668_set_jack_detect, + .use_pmdown_time = 1, + .endianness = 1, + .non_legacy_dai_naming = 1, +}; + +static const struct regmap_config rt5668_regmap = { + .reg_bits = 16, + .val_bits = 16, + .max_register = RT5668_I2C_MODE, + .volatile_reg = rt5668_volatile_register, + .readable_reg = rt5668_readable_register, + .cache_type = REGCACHE_RBTREE, + .reg_defaults = rt5668_reg, + .num_reg_defaults = ARRAY_SIZE(rt5668_reg), + .use_single_rw = true, +}; + +static const struct i2c_device_id rt5668_i2c_id[] = { + {"rt5668b", 0}, + {} +}; +MODULE_DEVICE_TABLE(i2c, rt5668_i2c_id); + +static int rt5668_parse_dt(struct rt5668_priv *rt5668, struct device *dev) +{ + + of_property_read_u32(dev->of_node, "realtek,dmic1-data-pin", + &rt5668->pdata.dmic1_data_pin); + of_property_read_u32(dev->of_node, "realtek,dmic1-clk-pin", + &rt5668->pdata.dmic1_clk_pin); + of_property_read_u32(dev->of_node, "realtek,jd-src", + &rt5668->pdata.jd_src); + + rt5668->pdata.ldo1_en = of_get_named_gpio(dev->of_node, + "realtek,ldo1-en-gpios", 0); + + return 0; +} + +static void rt5668_calibrate(struct rt5668_priv *rt5668) +{ + int value, count; + + mutex_lock(&rt5668->calibrate_mutex); + + rt5668_reset(rt5668->regmap); + regmap_write(rt5668->regmap, RT5668_PWR_ANLG_1, 0xa2bf); + usleep_range(15000, 20000); + regmap_write(rt5668->regmap, RT5668_PWR_ANLG_1, 0xf2bf); + regmap_write(rt5668->regmap, RT5668_MICBIAS_2, 0x0380); + regmap_write(rt5668->regmap, RT5668_PWR_DIG_1, 0x8001); + regmap_write(rt5668->regmap, RT5668_TEST_MODE_CTRL_1, 0x0000); + regmap_write(rt5668->regmap, RT5668_STO1_DAC_MIXER, 0x2080); + regmap_write(rt5668->regmap, RT5668_STO1_ADC_MIXER, 0x4040); + regmap_write(rt5668->regmap, RT5668_DEPOP_1, 0x0069); + regmap_write(rt5668->regmap, RT5668_CHOP_DAC, 0x3000); + regmap_write(rt5668->regmap, RT5668_HP_CTRL_2, 0x6000); + regmap_write(rt5668->regmap, RT5668_HP_CHARGE_PUMP_1, 0x0f26); + regmap_write(rt5668->regmap, RT5668_CALIB_ADC_CTRL, 0x7f05); + regmap_write(rt5668->regmap, RT5668_STO1_ADC_MIXER, 0x686c); + regmap_write(rt5668->regmap, RT5668_CAL_REC, 0x0d0d); + regmap_write(rt5668->regmap, RT5668_HP_CALIB_CTRL_9, 0x000f); + regmap_write(rt5668->regmap, RT5668_PWR_DIG_1, 0x8d01); + regmap_write(rt5668->regmap, RT5668_HP_CALIB_CTRL_2, 0x0321); + regmap_write(rt5668->regmap, RT5668_HP_LOGIC_CTRL_2, 0x0004); + regmap_write(rt5668->regmap, RT5668_HP_CALIB_CTRL_1, 0x7c00); + regmap_write(rt5668->regmap, RT5668_HP_CALIB_CTRL_3, 0x06a1); + regmap_write(rt5668->regmap, 
RT5668_A_DAC1_MUX, 0x0311);
+	regmap_write(rt5668->regmap, RT5668_RESET_HPF_CTRL, 0x0000);
+	regmap_write(rt5668->regmap, RT5668_ADC_STO1_HP_CTRL_1, 0x3320);
+
+	regmap_write(rt5668->regmap, RT5668_HP_CALIB_CTRL_1, 0xfc00);
+
+	for (count = 0; count < 60; count++) {
+		regmap_read(rt5668->regmap, RT5668_HP_CALIB_STA_1, &value);
+		if (!(value & 0x8000))
+			break;
+
+		usleep_range(10000, 10005);
+	}
+
+	if (count >= 60)
+		pr_err("HP Calibration Failure\n");
+
+	/* restore settings */
+	regmap_write(rt5668->regmap, RT5668_STO1_ADC_MIXER, 0xc0c4);
+	regmap_write(rt5668->regmap, RT5668_PWR_DIG_1, 0x0000);
+
+	mutex_unlock(&rt5668->calibrate_mutex);
+}
+
+static int rt5668_i2c_probe(struct i2c_client *i2c,
+		const struct i2c_device_id *id)
+{
+	struct rt5668_platform_data *pdata = dev_get_platdata(&i2c->dev);
+	struct rt5668_priv *rt5668;
+	int i, ret;
+	unsigned int val;
+
+	rt5668 = devm_kzalloc(&i2c->dev, sizeof(struct rt5668_priv),
+		GFP_KERNEL);
+
+	if (rt5668 == NULL)
+		return -ENOMEM;
+
+	i2c_set_clientdata(i2c, rt5668);
+
+	if (pdata)
+		rt5668->pdata = *pdata;
+	else
+		rt5668_parse_dt(rt5668, &i2c->dev);
+
+	rt5668->regmap = devm_regmap_init_i2c(i2c, &rt5668_regmap);
+	if (IS_ERR(rt5668->regmap)) {
+		ret = PTR_ERR(rt5668->regmap);
+		dev_err(&i2c->dev, "Failed to allocate register map: %d\n",
+			ret);
+		return ret;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(rt5668->supplies); i++)
+		rt5668->supplies[i].supply = rt5668_supply_names[i];
+
+	ret = devm_regulator_bulk_get(&i2c->dev, ARRAY_SIZE(rt5668->supplies),
+		rt5668->supplies);
+	if (ret != 0) {
+		dev_err(&i2c->dev, "Failed to request supplies: %d\n", ret);
+		return ret;
+	}
+
+	ret = regulator_bulk_enable(ARRAY_SIZE(rt5668->supplies),
+		rt5668->supplies);
+	if (ret != 0) {
+		dev_err(&i2c->dev, "Failed to enable supplies: %d\n", ret);
+		return ret;
+	}
+
+	if (gpio_is_valid(rt5668->pdata.ldo1_en)) {
+		if (devm_gpio_request_one(&i2c->dev, rt5668->pdata.ldo1_en,
+			GPIOF_OUT_INIT_HIGH, "rt5668"))
+			dev_err(&i2c->dev, "Failed to request ldo1_en GPIO\n");
+	}
+
+	/* Sleep for 300 ms minimum */
+	usleep_range(300000, 350000);
+
+	regmap_write(rt5668->regmap, RT5668_I2C_MODE, 0x1);
+	usleep_range(10000, 15000);
+
+	regmap_read(rt5668->regmap, RT5668_DEVICE_ID, &val);
+	if (val != DEVICE_ID) {
+		pr_err("Device with ID register %x is not rt5668\n", val);
+		return -ENODEV;
+	}
+
+	rt5668_reset(rt5668->regmap);
+
+	rt5668_calibrate(rt5668);
+
+	regmap_write(rt5668->regmap, RT5668_DEPOP_1, 0x0000);
+
+	/* DMIC pin */
+	if (rt5668->pdata.dmic1_data_pin != RT5668_DMIC1_NULL) {
+		switch (rt5668->pdata.dmic1_data_pin) {
+		case RT5668_DMIC1_DATA_GPIO2: /* share with LRCK2 */
+			regmap_update_bits(rt5668->regmap, RT5668_DMIC_CTRL_1,
+				RT5668_DMIC_1_DP_MASK, RT5668_DMIC_1_DP_GPIO2);
+			regmap_update_bits(rt5668->regmap, RT5668_GPIO_CTRL_1,
+				RT5668_GP2_PIN_MASK, RT5668_GP2_PIN_DMIC_SDA);
+			break;
+
+		case RT5668_DMIC1_DATA_GPIO5: /* share with DACDAT1 */
+			regmap_update_bits(rt5668->regmap, RT5668_DMIC_CTRL_1,
+				RT5668_DMIC_1_DP_MASK, RT5668_DMIC_1_DP_GPIO5);
+			regmap_update_bits(rt5668->regmap, RT5668_GPIO_CTRL_1,
+				RT5668_GP5_PIN_MASK, RT5668_GP5_PIN_DMIC_SDA);
+			break;
+
+		default:
+			dev_dbg(&i2c->dev, "invalid DMIC_DAT pin\n");
+			break;
+		}
+
+		switch (rt5668->pdata.dmic1_clk_pin) {
+		case RT5668_DMIC1_CLK_GPIO1: /* share with IRQ */
+			regmap_update_bits(rt5668->regmap, RT5668_GPIO_CTRL_1,
+				RT5668_GP1_PIN_MASK, RT5668_GP1_PIN_DMIC_CLK);
+			break;
+
+		case RT5668_DMIC1_CLK_GPIO3: /* share with BCLK2 */
+			regmap_update_bits(rt5668->regmap, RT5668_GPIO_CTRL_1,
+				RT5668_GP3_PIN_MASK, RT5668_GP3_PIN_DMIC_CLK);
+			break;
+
+		default:
+			dev_dbg(&i2c->dev, "invalid DMIC_CLK pin\n");
+			break;
+		}
+	}
+
+	regmap_update_bits(rt5668->regmap, RT5668_PWR_ANLG_1,
+		RT5668_LDO1_DVO_MASK | RT5668_HP_DRIVER_MASK,
+		RT5668_LDO1_DVO_14 | RT5668_HP_DRIVER_5X);
+	regmap_write(rt5668->regmap, RT5668_MICBIAS_2, 0x0380);
+	regmap_update_bits(rt5668->regmap, RT5668_GPIO_CTRL_1,
+		RT5668_GP4_PIN_MASK | RT5668_GP5_PIN_MASK,
+		RT5668_GP4_PIN_ADCDAT1 | RT5668_GP5_PIN_DACDAT1);
+	regmap_write(rt5668->regmap, RT5668_TEST_MODE_CTRL_1, 0x0000);
+
+	INIT_DELAYED_WORK(&rt5668->jack_detect_work,
+		rt5668_jack_detect_handler);
+	INIT_DELAYED_WORK(&rt5668->jd_check_work,
+		rt5668_jd_check_handler);
+
+	mutex_init(&rt5668->calibrate_mutex);
+
+	if (i2c->irq) {
+		ret = devm_request_threaded_irq(&i2c->dev, i2c->irq, NULL,
+			rt5668_irq, IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING
+			| IRQF_ONESHOT, "rt5668", rt5668);
+		if (ret)
+			dev_err(&i2c->dev, "Failed to request IRQ: %d\n", ret);
+	}
+
+	return snd_soc_register_component(&i2c->dev, &soc_component_dev_rt5668,
+		rt5668_dai, ARRAY_SIZE(rt5668_dai));
+}
+
+static int rt5668_i2c_remove(struct i2c_client *i2c)
+{
+	snd_soc_unregister_component(&i2c->dev);
+
+	return 0;
+}
+
+static void rt5668_i2c_shutdown(struct i2c_client *client)
+{
+	struct rt5668_priv *rt5668 = i2c_get_clientdata(client);
+
+	rt5668_reset(rt5668->regmap);
+}
+
+#ifdef CONFIG_OF
+static const struct of_device_id rt5668_of_match[] = {
+	{.compatible = "realtek,rt5668b"},
+	{},
+};
+MODULE_DEVICE_TABLE(of, rt5668_of_match);
+#endif
+
+#ifdef CONFIG_ACPI
+static const struct acpi_device_id rt5668_acpi_match[] = {
+	{"10EC5668", 0,},
+	{},
+};
+MODULE_DEVICE_TABLE(acpi, rt5668_acpi_match);
+#endif
+
+static struct i2c_driver rt5668_i2c_driver = {
+	.driver = {
+		.name = "rt5668b",
+		.of_match_table = of_match_ptr(rt5668_of_match),
+		.acpi_match_table = ACPI_PTR(rt5668_acpi_match),
+	},
+	.probe = rt5668_i2c_probe,
+	.remove = rt5668_i2c_remove,
+	.shutdown = rt5668_i2c_shutdown,
+	.id_table = rt5668_i2c_id,
+};
+module_i2c_driver(rt5668_i2c_driver);
+
+MODULE_DESCRIPTION("ASoC RT5668B driver");
+MODULE_AUTHOR("Bard Liao ");
+MODULE_LICENSE("GPL v2");
diff --git a/sound/soc/codecs/rt5668.h b/sound/soc/codecs/rt5668.h
new file mode 100644
index 000000000000..3e7bcfd569ec
--- /dev/null
+++ b/sound/soc/codecs/rt5668.h
@@ -0,0 +1,1318 @@
+/*
+ * rt5668.h -- RT5668/RT5668B ALSA SoC audio driver
+ *
+ * Copyright 2018 Realtek Microelectronics
+ * Author: Bard Liao
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */ + +#ifndef __RT5668_H__ +#define __RT5668_H__ + +#include + +#define DEVICE_ID 0x6530 + +/* Info */ +#define RT5668_RESET 0x0000 +#define RT5668_VERSION_ID 0x00fd +#define RT5668_VENDOR_ID 0x00fe +#define RT5668_DEVICE_ID 0x00ff +/* I/O - Output */ +#define RT5668_HP_CTRL_1 0x0002 +#define RT5668_HP_CTRL_2 0x0003 +#define RT5668_HPL_GAIN 0x0005 +#define RT5668_HPR_GAIN 0x0006 + +#define RT5668_I2C_CTRL 0x0008 + +/* I/O - Input */ +#define RT5668_CBJ_BST_CTRL 0x000b +#define RT5668_CBJ_CTRL_1 0x0010 +#define RT5668_CBJ_CTRL_2 0x0011 +#define RT5668_CBJ_CTRL_3 0x0012 +#define RT5668_CBJ_CTRL_4 0x0013 +#define RT5668_CBJ_CTRL_5 0x0014 +#define RT5668_CBJ_CTRL_6 0x0015 +#define RT5668_CBJ_CTRL_7 0x0016 +/* I/O - ADC/DAC/DMIC */ +#define RT5668_DAC1_DIG_VOL 0x0019 +#define RT5668_STO1_ADC_DIG_VOL 0x001c +#define RT5668_STO1_ADC_BOOST 0x001f +#define RT5668_HP_IMP_GAIN_1 0x0022 +#define RT5668_HP_IMP_GAIN_2 0x0023 +/* Mixer - D-D */ +#define RT5668_SIDETONE_CTRL 0x0024 +#define RT5668_STO1_ADC_MIXER 0x0026 +#define RT5668_AD_DA_MIXER 0x0029 +#define RT5668_STO1_DAC_MIXER 0x002a +#define RT5668_A_DAC1_MUX 0x002b +#define RT5668_DIG_INF2_DATA 0x0030 +/* Mixer - ADC */ +#define RT5668_REC_MIXER 0x003c +#define RT5668_CAL_REC 0x0044 +#define RT5668_ALC_BACK_GAIN 0x0049 +/* Power */ +#define RT5668_PWR_DIG_1 0x0061 +#define RT5668_PWR_DIG_2 0x0062 +#define RT5668_PWR_ANLG_1 0x0063 +#define RT5668_PWR_ANLG_2 0x0064 +#define RT5668_PWR_ANLG_3 0x0065 +#define RT5668_PWR_MIXER 0x0066 +#define RT5668_PWR_VOL 0x0067 +/* Clock Detect */ +#define RT5668_CLK_DET 0x006b +/* Filter Auto Reset */ +#define RT5668_RESET_LPF_CTRL 0x006c +#define RT5668_RESET_HPF_CTRL 0x006d +/* DMIC */ +#define RT5668_DMIC_CTRL_1 0x006e +/* Format - ADC/DAC */ +#define RT5668_I2S1_SDP 0x0070 +#define RT5668_I2S2_SDP 0x0071 +#define RT5668_ADDA_CLK_1 0x0073 +#define RT5668_ADDA_CLK_2 0x0074 +#define RT5668_I2S1_F_DIV_CTRL_1 0x0075 +#define RT5668_I2S1_F_DIV_CTRL_2 0x0076 +/* Format - TDM Control */ +#define RT5668_TDM_CTRL 0x0079 +#define RT5668_TDM_ADDA_CTRL_1 0x007a +#define RT5668_TDM_ADDA_CTRL_2 0x007b +#define RT5668_DATA_SEL_CTRL_1 0x007c +#define RT5668_TDM_TCON_CTRL 0x007e +/* Function - Analog */ +#define RT5668_GLB_CLK 0x0080 +#define RT5668_PLL_CTRL_1 0x0081 +#define RT5668_PLL_CTRL_2 0x0082 +#define RT5668_PLL_TRACK_1 0x0083 +#define RT5668_PLL_TRACK_2 0x0084 +#define RT5668_PLL_TRACK_3 0x0085 +#define RT5668_PLL_TRACK_4 0x0086 +#define RT5668_PLL_TRACK_5 0x0087 +#define RT5668_PLL_TRACK_6 0x0088 +#define RT5668_PLL_TRACK_11 0x008c +#define RT5668_SDW_REF_CLK 0x008d +#define RT5668_DEPOP_1 0x008e +#define RT5668_DEPOP_2 0x008f +#define RT5668_HP_CHARGE_PUMP_1 0x0091 +#define RT5668_HP_CHARGE_PUMP_2 0x0092 +#define RT5668_MICBIAS_1 0x0093 +#define RT5668_MICBIAS_2 0x0094 +#define RT5668_PLL_TRACK_12 0x0098 +#define RT5668_PLL_TRACK_14 0x009a +#define RT5668_PLL2_CTRL_1 0x009b +#define RT5668_PLL2_CTRL_2 0x009c +#define RT5668_PLL2_CTRL_3 0x009d +#define RT5668_PLL2_CTRL_4 0x009e +#define RT5668_RC_CLK_CTRL 0x009f +#define RT5668_I2S_M_CLK_CTRL_1 0x00a0 +#define RT5668_I2S2_F_DIV_CTRL_1 0x00a3 +#define RT5668_I2S2_F_DIV_CTRL_2 0x00a4 +/* Function - Digital */ +#define RT5668_EQ_CTRL_1 0x00ae +#define RT5668_EQ_CTRL_2 0x00af +#define RT5668_IRQ_CTRL_1 0x00b6 +#define RT5668_IRQ_CTRL_2 0x00b7 +#define RT5668_IRQ_CTRL_3 0x00b8 +#define RT5668_IRQ_CTRL_4 0x00b9 +#define RT5668_INT_ST_1 0x00be +#define RT5668_GPIO_CTRL_1 0x00c0 +#define RT5668_GPIO_CTRL_2 0x00c1 +#define RT5668_GPIO_CTRL_3 0x00c2 +#define 
RT5668_HP_AMP_DET_CTRL_1 0x00d0 +#define RT5668_HP_AMP_DET_CTRL_2 0x00d1 +#define RT5668_MID_HP_AMP_DET 0x00d2 +#define RT5668_LOW_HP_AMP_DET 0x00d3 +#define RT5668_DELAY_BUF_CTRL 0x00d4 +#define RT5668_SV_ZCD_1 0x00d9 +#define RT5668_SV_ZCD_2 0x00da +#define RT5668_IL_CMD_1 0x00db +#define RT5668_IL_CMD_2 0x00dc +#define RT5668_IL_CMD_3 0x00dd +#define RT5668_IL_CMD_4 0x00de +#define RT5668_IL_CMD_5 0x00df +#define RT5668_IL_CMD_6 0x00e0 +#define RT5668_4BTN_IL_CMD_1 0x00e2 +#define RT5668_4BTN_IL_CMD_2 0x00e3 +#define RT5668_4BTN_IL_CMD_3 0x00e4 +#define RT5668_4BTN_IL_CMD_4 0x00e5 +#define RT5668_4BTN_IL_CMD_5 0x00e6 +#define RT5668_4BTN_IL_CMD_6 0x00e7 +#define RT5668_4BTN_IL_CMD_7 0x00e8 + +#define RT5668_ADC_STO1_HP_CTRL_1 0x00ea +#define RT5668_ADC_STO1_HP_CTRL_2 0x00eb +#define RT5668_AJD1_CTRL 0x00f0 +#define RT5668_JD1_THD 0x00f1 +#define RT5668_JD2_THD 0x00f2 +#define RT5668_JD_CTRL_1 0x00f6 +/* General Control */ +#define RT5668_DUMMY_1 0x00fa +#define RT5668_DUMMY_2 0x00fb +#define RT5668_DUMMY_3 0x00fc + +#define RT5668_DAC_ADC_DIG_VOL1 0x0100 +#define RT5668_BIAS_CUR_CTRL_2 0x010b +#define RT5668_BIAS_CUR_CTRL_3 0x010c +#define RT5668_BIAS_CUR_CTRL_4 0x010d +#define RT5668_BIAS_CUR_CTRL_5 0x010e +#define RT5668_BIAS_CUR_CTRL_6 0x010f +#define RT5668_BIAS_CUR_CTRL_7 0x0110 +#define RT5668_BIAS_CUR_CTRL_8 0x0111 +#define RT5668_BIAS_CUR_CTRL_9 0x0112 +#define RT5668_BIAS_CUR_CTRL_10 0x0113 +#define RT5668_VREF_REC_OP_FB_CAP_CTRL 0x0117 +#define RT5668_CHARGE_PUMP_1 0x0125 +#define RT5668_DIG_IN_CTRL_1 0x0132 +#define RT5668_PAD_DRIVING_CTRL 0x0136 +#define RT5668_SOFT_RAMP_DEPOP 0x0138 +#define RT5668_CHOP_DAC 0x013a +#define RT5668_CHOP_ADC 0x013b +#define RT5668_CALIB_ADC_CTRL 0x013c +#define RT5668_VOL_TEST 0x013f +#define RT5668_SPKVDD_DET_STA 0x0142 +#define RT5668_TEST_MODE_CTRL_1 0x0145 +#define RT5668_TEST_MODE_CTRL_2 0x0146 +#define RT5668_TEST_MODE_CTRL_3 0x0147 +#define RT5668_TEST_MODE_CTRL_4 0x0148 +#define RT5668_TEST_MODE_CTRL_5 0x0149 +#define RT5668_PLL1_INTERNAL 0x0150 +#define RT5668_PLL2_INTERNAL 0x0151 +#define RT5668_STO_NG2_CTRL_1 0x0160 +#define RT5668_STO_NG2_CTRL_2 0x0161 +#define RT5668_STO_NG2_CTRL_3 0x0162 +#define RT5668_STO_NG2_CTRL_4 0x0163 +#define RT5668_STO_NG2_CTRL_5 0x0164 +#define RT5668_STO_NG2_CTRL_6 0x0165 +#define RT5668_STO_NG2_CTRL_7 0x0166 +#define RT5668_STO_NG2_CTRL_8 0x0167 +#define RT5668_STO_NG2_CTRL_9 0x0168 +#define RT5668_STO_NG2_CTRL_10 0x0169 +#define RT5668_STO1_DAC_SIL_DET 0x0190 +#define RT5668_SIL_PSV_CTRL1 0x0194 +#define RT5668_SIL_PSV_CTRL2 0x0195 +#define RT5668_SIL_PSV_CTRL3 0x0197 +#define RT5668_SIL_PSV_CTRL4 0x0198 +#define RT5668_SIL_PSV_CTRL5 0x0199 +#define RT5668_HP_IMP_SENS_CTRL_01 0x01af +#define RT5668_HP_IMP_SENS_CTRL_02 0x01b0 +#define RT5668_HP_IMP_SENS_CTRL_03 0x01b1 +#define RT5668_HP_IMP_SENS_CTRL_04 0x01b2 +#define RT5668_HP_IMP_SENS_CTRL_05 0x01b3 +#define RT5668_HP_IMP_SENS_CTRL_06 0x01b4 +#define RT5668_HP_IMP_SENS_CTRL_07 0x01b5 +#define RT5668_HP_IMP_SENS_CTRL_08 0x01b6 +#define RT5668_HP_IMP_SENS_CTRL_09 0x01b7 +#define RT5668_HP_IMP_SENS_CTRL_10 0x01b8 +#define RT5668_HP_IMP_SENS_CTRL_11 0x01b9 +#define RT5668_HP_IMP_SENS_CTRL_12 0x01ba +#define RT5668_HP_IMP_SENS_CTRL_13 0x01bb +#define RT5668_HP_IMP_SENS_CTRL_14 0x01bc +#define RT5668_HP_IMP_SENS_CTRL_15 0x01bd +#define RT5668_HP_IMP_SENS_CTRL_16 0x01be +#define RT5668_HP_IMP_SENS_CTRL_17 0x01bf +#define RT5668_HP_IMP_SENS_CTRL_18 0x01c0 +#define RT5668_HP_IMP_SENS_CTRL_19 0x01c1 +#define RT5668_HP_IMP_SENS_CTRL_20 0x01c2 +#define 
RT5668_HP_IMP_SENS_CTRL_21 0x01c3 +#define RT5668_HP_IMP_SENS_CTRL_22 0x01c4 +#define RT5668_HP_IMP_SENS_CTRL_23 0x01c5 +#define RT5668_HP_IMP_SENS_CTRL_24 0x01c6 +#define RT5668_HP_IMP_SENS_CTRL_25 0x01c7 +#define RT5668_HP_IMP_SENS_CTRL_26 0x01c8 +#define RT5668_HP_IMP_SENS_CTRL_27 0x01c9 +#define RT5668_HP_IMP_SENS_CTRL_28 0x01ca +#define RT5668_HP_IMP_SENS_CTRL_29 0x01cb +#define RT5668_HP_IMP_SENS_CTRL_30 0x01cc +#define RT5668_HP_IMP_SENS_CTRL_31 0x01cd +#define RT5668_HP_IMP_SENS_CTRL_32 0x01ce +#define RT5668_HP_IMP_SENS_CTRL_33 0x01cf +#define RT5668_HP_IMP_SENS_CTRL_34 0x01d0 +#define RT5668_HP_IMP_SENS_CTRL_35 0x01d1 +#define RT5668_HP_IMP_SENS_CTRL_36 0x01d2 +#define RT5668_HP_IMP_SENS_CTRL_37 0x01d3 +#define RT5668_HP_IMP_SENS_CTRL_38 0x01d4 +#define RT5668_HP_IMP_SENS_CTRL_39 0x01d5 +#define RT5668_HP_IMP_SENS_CTRL_40 0x01d6 +#define RT5668_HP_IMP_SENS_CTRL_41 0x01d7 +#define RT5668_HP_IMP_SENS_CTRL_42 0x01d8 +#define RT5668_HP_IMP_SENS_CTRL_43 0x01d9 +#define RT5668_HP_LOGIC_CTRL_1 0x01da +#define RT5668_HP_LOGIC_CTRL_2 0x01db +#define RT5668_HP_LOGIC_CTRL_3 0x01dc +#define RT5668_HP_CALIB_CTRL_1 0x01de +#define RT5668_HP_CALIB_CTRL_2 0x01df +#define RT5668_HP_CALIB_CTRL_3 0x01e0 +#define RT5668_HP_CALIB_CTRL_4 0x01e1 +#define RT5668_HP_CALIB_CTRL_5 0x01e2 +#define RT5668_HP_CALIB_CTRL_6 0x01e3 +#define RT5668_HP_CALIB_CTRL_7 0x01e4 +#define RT5668_HP_CALIB_CTRL_9 0x01e6 +#define RT5668_HP_CALIB_CTRL_10 0x01e7 +#define RT5668_HP_CALIB_CTRL_11 0x01e8 +#define RT5668_HP_CALIB_STA_1 0x01ea +#define RT5668_HP_CALIB_STA_2 0x01eb +#define RT5668_HP_CALIB_STA_3 0x01ec +#define RT5668_HP_CALIB_STA_4 0x01ed +#define RT5668_HP_CALIB_STA_5 0x01ee +#define RT5668_HP_CALIB_STA_6 0x01ef +#define RT5668_HP_CALIB_STA_7 0x01f0 +#define RT5668_HP_CALIB_STA_8 0x01f1 +#define RT5668_HP_CALIB_STA_9 0x01f2 +#define RT5668_HP_CALIB_STA_10 0x01f3 +#define RT5668_HP_CALIB_STA_11 0x01f4 +#define RT5668_SAR_IL_CMD_1 0x0210 +#define RT5668_SAR_IL_CMD_2 0x0211 +#define RT5668_SAR_IL_CMD_3 0x0212 +#define RT5668_SAR_IL_CMD_4 0x0213 +#define RT5668_SAR_IL_CMD_5 0x0214 +#define RT5668_SAR_IL_CMD_6 0x0215 +#define RT5668_SAR_IL_CMD_7 0x0216 +#define RT5668_SAR_IL_CMD_8 0x0217 +#define RT5668_SAR_IL_CMD_9 0x0218 +#define RT5668_SAR_IL_CMD_10 0x0219 +#define RT5668_SAR_IL_CMD_11 0x021a +#define RT5668_SAR_IL_CMD_12 0x021b +#define RT5668_SAR_IL_CMD_13 0x021c +#define RT5668_EFUSE_CTRL_1 0x0250 +#define RT5668_EFUSE_CTRL_2 0x0251 +#define RT5668_EFUSE_CTRL_3 0x0252 +#define RT5668_EFUSE_CTRL_4 0x0253 +#define RT5668_EFUSE_CTRL_5 0x0254 +#define RT5668_EFUSE_CTRL_6 0x0255 +#define RT5668_EFUSE_CTRL_7 0x0256 +#define RT5668_EFUSE_CTRL_8 0x0257 +#define RT5668_EFUSE_CTRL_9 0x0258 +#define RT5668_EFUSE_CTRL_10 0x0259 +#define RT5668_EFUSE_CTRL_11 0x025a +#define RT5668_JD_TOP_VC_VTRL 0x0270 +#define RT5668_DRC1_CTRL_0 0x02ff +#define RT5668_DRC1_CTRL_1 0x0300 +#define RT5668_DRC1_CTRL_2 0x0301 +#define RT5668_DRC1_CTRL_3 0x0302 +#define RT5668_DRC1_CTRL_4 0x0303 +#define RT5668_DRC1_CTRL_5 0x0304 +#define RT5668_DRC1_CTRL_6 0x0305 +#define RT5668_DRC1_HARD_LMT_CTRL_1 0x0306 +#define RT5668_DRC1_HARD_LMT_CTRL_2 0x0307 +#define RT5668_DRC1_PRIV_1 0x0310 +#define RT5668_DRC1_PRIV_2 0x0311 +#define RT5668_DRC1_PRIV_3 0x0312 +#define RT5668_DRC1_PRIV_4 0x0313 +#define RT5668_DRC1_PRIV_5 0x0314 +#define RT5668_DRC1_PRIV_6 0x0315 +#define RT5668_DRC1_PRIV_7 0x0316 +#define RT5668_DRC1_PRIV_8 0x0317 +#define RT5668_EQ_AUTO_RCV_CTRL1 0x03c0 +#define RT5668_EQ_AUTO_RCV_CTRL2 0x03c1 +#define RT5668_EQ_AUTO_RCV_CTRL3 0x03c2 
+#define RT5668_EQ_AUTO_RCV_CTRL4 0x03c3 +#define RT5668_EQ_AUTO_RCV_CTRL5 0x03c4 +#define RT5668_EQ_AUTO_RCV_CTRL6 0x03c5 +#define RT5668_EQ_AUTO_RCV_CTRL7 0x03c6 +#define RT5668_EQ_AUTO_RCV_CTRL8 0x03c7 +#define RT5668_EQ_AUTO_RCV_CTRL9 0x03c8 +#define RT5668_EQ_AUTO_RCV_CTRL10 0x03c9 +#define RT5668_EQ_AUTO_RCV_CTRL11 0x03ca +#define RT5668_EQ_AUTO_RCV_CTRL12 0x03cb +#define RT5668_EQ_AUTO_RCV_CTRL13 0x03cc +#define RT5668_ADC_L_EQ_LPF1_A1 0x03d0 +#define RT5668_R_EQ_LPF1_A1 0x03d1 +#define RT5668_L_EQ_LPF1_H0 0x03d2 +#define RT5668_R_EQ_LPF1_H0 0x03d3 +#define RT5668_L_EQ_BPF1_A1 0x03d4 +#define RT5668_R_EQ_BPF1_A1 0x03d5 +#define RT5668_L_EQ_BPF1_A2 0x03d6 +#define RT5668_R_EQ_BPF1_A2 0x03d7 +#define RT5668_L_EQ_BPF1_H0 0x03d8 +#define RT5668_R_EQ_BPF1_H0 0x03d9 +#define RT5668_L_EQ_BPF2_A1 0x03da +#define RT5668_R_EQ_BPF2_A1 0x03db +#define RT5668_L_EQ_BPF2_A2 0x03dc +#define RT5668_R_EQ_BPF2_A2 0x03dd +#define RT5668_L_EQ_BPF2_H0 0x03de +#define RT5668_R_EQ_BPF2_H0 0x03df +#define RT5668_L_EQ_BPF3_A1 0x03e0 +#define RT5668_R_EQ_BPF3_A1 0x03e1 +#define RT5668_L_EQ_BPF3_A2 0x03e2 +#define RT5668_R_EQ_BPF3_A2 0x03e3 +#define RT5668_L_EQ_BPF3_H0 0x03e4 +#define RT5668_R_EQ_BPF3_H0 0x03e5 +#define RT5668_L_EQ_BPF4_A1 0x03e6 +#define RT5668_R_EQ_BPF4_A1 0x03e7 +#define RT5668_L_EQ_BPF4_A2 0x03e8 +#define RT5668_R_EQ_BPF4_A2 0x03e9 +#define RT5668_L_EQ_BPF4_H0 0x03ea +#define RT5668_R_EQ_BPF4_H0 0x03eb +#define RT5668_L_EQ_HPF1_A1 0x03ec +#define RT5668_R_EQ_HPF1_A1 0x03ed +#define RT5668_L_EQ_HPF1_H0 0x03ee +#define RT5668_R_EQ_HPF1_H0 0x03ef +#define RT5668_L_EQ_PRE_VOL 0x03f0 +#define RT5668_R_EQ_PRE_VOL 0x03f1 +#define RT5668_L_EQ_POST_VOL 0x03f2 +#define RT5668_R_EQ_POST_VOL 0x03f3 +#define RT5668_I2C_MODE 0xffff + + +/* global definition */ +#define RT5668_L_MUTE (0x1 << 15) +#define RT5668_L_MUTE_SFT 15 +#define RT5668_VOL_L_MUTE (0x1 << 14) +#define RT5668_VOL_L_SFT 14 +#define RT5668_R_MUTE (0x1 << 7) +#define RT5668_R_MUTE_SFT 7 +#define RT5668_VOL_R_MUTE (0x1 << 6) +#define RT5668_VOL_R_SFT 6 +#define RT5668_L_VOL_MASK (0x3f << 8) +#define RT5668_L_VOL_SFT 8 +#define RT5668_R_VOL_MASK (0x3f) +#define RT5668_R_VOL_SFT 0 + +/* Headphone Amp L/R Analog Gain and Digital NG2 Gain Control (0x0005 0x0006) */ +#define RT5668_G_HP (0xf << 8) +#define RT5668_G_HP_SFT 8 +#define RT5668_G_STO_DA_DMIX (0xf) +#define RT5668_G_STO_DA_SFT 0 + +/* CBJ Control (0x000b) */ +#define RT5668_BST_CBJ_MASK (0xf << 8) +#define RT5668_BST_CBJ_SFT 8 + +/* Embedded Jack and Type Detection Control 1 (0x0010) */ +#define RT5668_EMB_JD_EN (0x1 << 15) +#define RT5668_EMB_JD_EN_SFT 15 +#define RT5668_EMB_JD_RST (0x1 << 14) +#define RT5668_JD_MODE (0x1 << 13) +#define RT5668_JD_MODE_SFT 13 +#define RT5668_DET_TYPE (0x1 << 12) +#define RT5668_DET_TYPE_SFT 12 +#define RT5668_POLA_EXT_JD_MASK (0x1 << 11) +#define RT5668_POLA_EXT_JD_LOW (0x1 << 11) +#define RT5668_POLA_EXT_JD_HIGH (0x0 << 11) +#define RT5668_EXT_JD_DIG (0x1 << 9) +#define RT5668_POL_FAST_OFF_MASK (0x1 << 8) +#define RT5668_POL_FAST_OFF_HIGH (0x1 << 8) +#define RT5668_POL_FAST_OFF_LOW (0x0 << 8) +#define RT5668_FAST_OFF_MASK (0x1 << 7) +#define RT5668_FAST_OFF_EN (0x1 << 7) +#define RT5668_FAST_OFF_DIS (0x0 << 7) +#define RT5668_VREF_POW_MASK (0x1 << 6) +#define RT5668_VREF_POW_FSM (0x0 << 6) +#define RT5668_VREF_POW_REG (0x1 << 6) +#define RT5668_MB1_PATH_MASK (0x1 << 5) +#define RT5668_CTRL_MB1_REG (0x1 << 5) +#define RT5668_CTRL_MB1_FSM (0x0 << 5) +#define RT5668_MB2_PATH_MASK (0x1 << 4) +#define RT5668_CTRL_MB2_REG (0x1 << 4) +#define 
RT5668_CTRL_MB2_FSM (0x0 << 4) +#define RT5668_TRIG_JD_MASK (0x1 << 3) +#define RT5668_TRIG_JD_HIGH (0x1 << 3) +#define RT5668_TRIG_JD_LOW (0x0 << 3) +#define RT5668_MIC_CAP_MASK (0x1 << 1) +#define RT5668_MIC_CAP_HS (0x1 << 1) +#define RT5668_MIC_CAP_HP (0x0 << 1) +#define RT5668_MIC_CAP_SRC_MASK (0x1) +#define RT5668_MIC_CAP_SRC_REG (0x1) +#define RT5668_MIC_CAP_SRC_ANA (0x0) + +/* Embedded Jack and Type Detection Control 2 (0x0011) */ +#define RT5668_EXT_JD_SRC (0x7 << 4) +#define RT5668_EXT_JD_SRC_SFT 4 +#define RT5668_EXT_JD_SRC_GPIO_JD1 (0x0 << 4) +#define RT5668_EXT_JD_SRC_GPIO_JD2 (0x1 << 4) +#define RT5668_EXT_JD_SRC_JDH (0x2 << 4) +#define RT5668_EXT_JD_SRC_JDL (0x3 << 4) +#define RT5668_EXT_JD_SRC_MANUAL (0x4 << 4) +#define RT5668_JACK_TYPE_MASK (0x3) + +/* Combo Jack and Type Detection Control 3 (0x0012) */ +#define RT5668_CBJ_IN_BUF_EN (0x1 << 7) + +/* Combo Jack and Type Detection Control 4 (0x0013) */ +#define RT5668_SEL_SHT_MID_TON_MASK (0x3 << 12) +#define RT5668_SEL_SHT_MID_TON_2 (0x0 << 12) +#define RT5668_SEL_SHT_MID_TON_3 (0x1 << 12) +#define RT5668_CBJ_JD_TEST_MASK (0x1 << 6) +#define RT5668_CBJ_JD_TEST_NORM (0x0 << 6) +#define RT5668_CBJ_JD_TEST_MODE (0x1 << 6) + +/* DAC1 Digital Volume (0x0019) */ +#define RT5668_DAC_L1_VOL_MASK (0xff << 8) +#define RT5668_DAC_L1_VOL_SFT 8 +#define RT5668_DAC_R1_VOL_MASK (0xff) +#define RT5668_DAC_R1_VOL_SFT 0 + +/* ADC Digital Volume Control (0x001c) */ +#define RT5668_ADC_L_VOL_MASK (0x7f << 8) +#define RT5668_ADC_L_VOL_SFT 8 +#define RT5668_ADC_R_VOL_MASK (0x7f) +#define RT5668_ADC_R_VOL_SFT 0 + +/* Stereo1 ADC Boost Gain Control (0x001f) */ +#define RT5668_STO1_ADC_L_BST_MASK (0x3 << 14) +#define RT5668_STO1_ADC_L_BST_SFT 14 +#define RT5668_STO1_ADC_R_BST_MASK (0x3 << 12) +#define RT5668_STO1_ADC_R_BST_SFT 12 + +/* Sidetone Control (0x0024) */ +#define RT5668_ST_SRC_SEL (0x1 << 8) +#define RT5668_ST_SRC_SFT 8 +#define RT5668_ST_EN_MASK (0x1 << 6) +#define RT5668_ST_DIS (0x0 << 6) +#define RT5668_ST_EN (0x1 << 6) +#define RT5668_ST_EN_SFT 6 + +/* Stereo1 ADC Mixer Control (0x0026) */ +#define RT5668_M_STO1_ADC_L1 (0x1 << 15) +#define RT5668_M_STO1_ADC_L1_SFT 15 +#define RT5668_M_STO1_ADC_L2 (0x1 << 14) +#define RT5668_M_STO1_ADC_L2_SFT 14 +#define RT5668_STO1_ADC1L_SRC_MASK (0x1 << 13) +#define RT5668_STO1_ADC1L_SRC_SFT 13 +#define RT5668_STO1_ADC1_SRC_ADC (0x1 << 13) +#define RT5668_STO1_ADC1_SRC_DACMIX (0x0 << 13) +#define RT5668_STO1_ADC2L_SRC_MASK (0x1 << 12) +#define RT5668_STO1_ADC2L_SRC_SFT 12 +#define RT5668_STO1_ADCL_SRC_MASK (0x3 << 10) +#define RT5668_STO1_ADCL_SRC_SFT 10 +#define RT5668_STO1_DD_L_SRC_MASK (0x1 << 9) +#define RT5668_STO1_DD_L_SRC_SFT 9 +#define RT5668_STO1_DMIC_SRC_MASK (0x1 << 8) +#define RT5668_STO1_DMIC_SRC_SFT 8 +#define RT5668_STO1_DMIC_SRC_DMIC2 (0x1 << 8) +#define RT5668_STO1_DMIC_SRC_DMIC1 (0x0 << 8) +#define RT5668_M_STO1_ADC_R1 (0x1 << 7) +#define RT5668_M_STO1_ADC_R1_SFT 7 +#define RT5668_M_STO1_ADC_R2 (0x1 << 6) +#define RT5668_M_STO1_ADC_R2_SFT 6 +#define RT5668_STO1_ADC1R_SRC_MASK (0x1 << 5) +#define RT5668_STO1_ADC1R_SRC_SFT 5 +#define RT5668_STO1_ADC2R_SRC_MASK (0x1 << 4) +#define RT5668_STO1_ADC2R_SRC_SFT 4 +#define RT5668_STO1_ADCR_SRC_MASK (0x3 << 2) +#define RT5668_STO1_ADCR_SRC_SFT 2 + +/* ADC Mixer to DAC Mixer Control (0x0029) */ +#define RT5668_M_ADCMIX_L (0x1 << 15) +#define RT5668_M_ADCMIX_L_SFT 15 +#define RT5668_M_DAC1_L (0x1 << 14) +#define RT5668_M_DAC1_L_SFT 14 +#define RT5668_DAC1_R_SEL_MASK (0x1 << 10) +#define RT5668_DAC1_R_SEL_SFT 10 +#define RT5668_DAC1_L_SEL_MASK 
(0x1 << 8) +#define RT5668_DAC1_L_SEL_SFT 8 +#define RT5668_M_ADCMIX_R (0x1 << 7) +#define RT5668_M_ADCMIX_R_SFT 7 +#define RT5668_M_DAC1_R (0x1 << 6) +#define RT5668_M_DAC1_R_SFT 6 + +/* Stereo1 DAC Mixer Control (0x002a) */ +#define RT5668_M_DAC_L1_STO_L (0x1 << 15) +#define RT5668_M_DAC_L1_STO_L_SFT 15 +#define RT5668_G_DAC_L1_STO_L_MASK (0x1 << 14) +#define RT5668_G_DAC_L1_STO_L_SFT 14 +#define RT5668_M_DAC_R1_STO_L (0x1 << 13) +#define RT5668_M_DAC_R1_STO_L_SFT 13 +#define RT5668_G_DAC_R1_STO_L_MASK (0x1 << 12) +#define RT5668_G_DAC_R1_STO_L_SFT 12 +#define RT5668_M_DAC_L1_STO_R (0x1 << 7) +#define RT5668_M_DAC_L1_STO_R_SFT 7 +#define RT5668_G_DAC_L1_STO_R_MASK (0x1 << 6) +#define RT5668_G_DAC_L1_STO_R_SFT 6 +#define RT5668_M_DAC_R1_STO_R (0x1 << 5) +#define RT5668_M_DAC_R1_STO_R_SFT 5 +#define RT5668_G_DAC_R1_STO_R_MASK (0x1 << 4) +#define RT5668_G_DAC_R1_STO_R_SFT 4 + +/* Analog DAC1 Input Source Control (0x002b) */ +#define RT5668_M_ST_STO_L (0x1 << 9) +#define RT5668_M_ST_STO_L_SFT 9 +#define RT5668_M_ST_STO_R (0x1 << 8) +#define RT5668_M_ST_STO_R_SFT 8 +#define RT5668_DAC_L1_SRC_MASK (0x3 << 4) +#define RT5668_A_DACL1_SFT 4 +#define RT5668_DAC_R1_SRC_MASK (0x3) +#define RT5668_A_DACR1_SFT 0 + +/* Digital Interface Data Control (0x0030) */ +#define RT5668_IF2_ADC_SEL_MASK (0x3 << 0) +#define RT5668_IF2_ADC_SEL_SFT 0 + +/* REC Left Mixer Control 2 (0x003c) */ +#define RT5668_G_CBJ_RM1_L (0x7 << 10) +#define RT5668_G_CBJ_RM1_L_SFT 10 +#define RT5668_M_CBJ_RM1_L (0x1 << 7) +#define RT5668_M_CBJ_RM1_L_SFT 7 + +/* Power Management for Digital 1 (0x0061) */ +#define RT5668_PWR_I2S1 (0x1 << 15) +#define RT5668_PWR_I2S1_BIT 15 +#define RT5668_PWR_I2S2 (0x1 << 14) +#define RT5668_PWR_I2S2_BIT 14 +#define RT5668_PWR_DAC_L1 (0x1 << 11) +#define RT5668_PWR_DAC_L1_BIT 11 +#define RT5668_PWR_DAC_R1 (0x1 << 10) +#define RT5668_PWR_DAC_R1_BIT 10 +#define RT5668_PWR_LDO (0x1 << 8) +#define RT5668_PWR_LDO_BIT 8 +#define RT5668_PWR_ADC_L1 (0x1 << 4) +#define RT5668_PWR_ADC_L1_BIT 4 +#define RT5668_PWR_ADC_R1 (0x1 << 3) +#define RT5668_PWR_ADC_R1_BIT 3 +#define RT5668_DIG_GATE_CTRL (0x1 << 0) +#define RT5668_DIG_GATE_CTRL_SFT 0 + + +/* Power Management for Digital 2 (0x0062) */ +#define RT5668_PWR_ADC_S1F (0x1 << 15) +#define RT5668_PWR_ADC_S1F_BIT 15 +#define RT5668_PWR_DAC_S1F (0x1 << 10) +#define RT5668_PWR_DAC_S1F_BIT 10 + +/* Power Management for Analog 1 (0x0063) */ +#define RT5668_PWR_VREF1 (0x1 << 15) +#define RT5668_PWR_VREF1_BIT 15 +#define RT5668_PWR_FV1 (0x1 << 14) +#define RT5668_PWR_FV1_BIT 14 +#define RT5668_PWR_VREF2 (0x1 << 13) +#define RT5668_PWR_VREF2_BIT 13 +#define RT5668_PWR_FV2 (0x1 << 12) +#define RT5668_PWR_FV2_BIT 12 +#define RT5668_LDO1_DBG_MASK (0x3 << 10) +#define RT5668_PWR_MB (0x1 << 9) +#define RT5668_PWR_MB_BIT 9 +#define RT5668_PWR_BG (0x1 << 7) +#define RT5668_PWR_BG_BIT 7 +#define RT5668_LDO1_BYPASS_MASK (0x1 << 6) +#define RT5668_LDO1_BYPASS (0x1 << 6) +#define RT5668_LDO1_NOT_BYPASS (0x0 << 6) +#define RT5668_PWR_MA_BIT 6 +#define RT5668_LDO1_DVO_MASK (0x3 << 4) +#define RT5668_LDO1_DVO_09 (0x0 << 4) +#define RT5668_LDO1_DVO_10 (0x1 << 4) +#define RT5668_LDO1_DVO_12 (0x2 << 4) +#define RT5668_LDO1_DVO_14 (0x3 << 4) +#define RT5668_HP_DRIVER_MASK (0x3 << 2) +#define RT5668_HP_DRIVER_1X (0x0 << 2) +#define RT5668_HP_DRIVER_3X (0x1 << 2) +#define RT5668_HP_DRIVER_5X (0x3 << 2) +#define RT5668_PWR_HA_L (0x1 << 1) +#define RT5668_PWR_HA_L_BIT 1 +#define RT5668_PWR_HA_R (0x1 << 0) +#define RT5668_PWR_HA_R_BIT 0 + +/* Power Management for Analog 2 (0x0064) */ +#define 
RT5668_PWR_MB1 (0x1 << 11) +#define RT5668_PWR_MB1_PWR_DOWN (0x0 << 11) +#define RT5668_PWR_MB1_BIT 11 +#define RT5668_PWR_MB2 (0x1 << 10) +#define RT5668_PWR_MB2_PWR_DOWN (0x0 << 10) +#define RT5668_PWR_MB2_BIT 10 +#define RT5668_PWR_JDH (0x1 << 3) +#define RT5668_PWR_JDH_BIT 3 +#define RT5668_PWR_JDL (0x1 << 2) +#define RT5668_PWR_JDL_BIT 2 +#define RT5668_PWR_RM1_L (0x1 << 1) +#define RT5668_PWR_RM1_L_BIT 1 + +/* Power Management for Analog 3 (0x0065) */ +#define RT5668_PWR_CBJ (0x1 << 9) +#define RT5668_PWR_CBJ_BIT 9 +#define RT5668_PWR_PLL (0x1 << 6) +#define RT5668_PWR_PLL_BIT 6 +#define RT5668_PWR_PLL2B (0x1 << 5) +#define RT5668_PWR_PLL2B_BIT 5 +#define RT5668_PWR_PLL2F (0x1 << 4) +#define RT5668_PWR_PLL2F_BIT 4 +#define RT5668_PWR_LDO2 (0x1 << 2) +#define RT5668_PWR_LDO2_BIT 2 +#define RT5668_PWR_DET_SPKVDD (0x1 << 1) +#define RT5668_PWR_DET_SPKVDD_BIT 1 + +/* Power Management for Mixer (0x0066) */ +#define RT5668_PWR_STO1_DAC_L (0x1 << 5) +#define RT5668_PWR_STO1_DAC_L_BIT 5 +#define RT5668_PWR_STO1_DAC_R (0x1 << 4) +#define RT5668_PWR_STO1_DAC_R_BIT 4 + +/* MCLK and System Clock Detection Control (0x006b) */ +#define RT5668_SYS_CLK_DET (0x1 << 15) +#define RT5668_SYS_CLK_DET_SFT 15 +#define RT5668_PLL1_CLK_DET (0x1 << 14) +#define RT5668_PLL1_CLK_DET_SFT 14 +#define RT5668_PLL2_CLK_DET (0x1 << 13) +#define RT5668_PLL2_CLK_DET_SFT 13 +#define RT5668_POW_CLK_DET2_SFT 8 +#define RT5668_POW_CLK_DET_SFT 0 + +/* Digital Microphone Control 1 (0x006e) */ +#define RT5668_DMIC_1_EN_MASK (0x1 << 15) +#define RT5668_DMIC_1_EN_SFT 15 +#define RT5668_DMIC_1_DIS (0x0 << 15) +#define RT5668_DMIC_1_EN (0x1 << 15) +#define RT5668_DMIC_1_DP_MASK (0x3 << 4) +#define RT5668_DMIC_1_DP_SFT 4 +#define RT5668_DMIC_1_DP_GPIO2 (0x0 << 4) +#define RT5668_DMIC_1_DP_GPIO5 (0x1 << 4) +#define RT5668_DMIC_CLK_MASK (0xf << 0) +#define RT5668_DMIC_CLK_SFT 0 + +/* I2S1 Audio Serial Data Port Control (0x0070) */ +#define RT5668_SEL_ADCDAT_MASK (0x1 << 15) +#define RT5668_SEL_ADCDAT_OUT (0x0 << 15) +#define RT5668_SEL_ADCDAT_IN (0x1 << 15) +#define RT5668_SEL_ADCDAT_SFT 15 +#define RT5668_I2S1_TX_CHL_MASK (0x7 << 12) +#define RT5668_I2S1_TX_CHL_SFT 12 +#define RT5668_I2S1_TX_CHL_16 (0x0 << 12) +#define RT5668_I2S1_TX_CHL_20 (0x1 << 12) +#define RT5668_I2S1_TX_CHL_24 (0x2 << 12) +#define RT5668_I2S1_TX_CHL_32 (0x3 << 12) +#define RT5668_I2S1_TX_CHL_8 (0x4 << 12) +#define RT5668_I2S1_RX_CHL_MASK (0x7 << 8) +#define RT5668_I2S1_RX_CHL_SFT 8 +#define RT5668_I2S1_RX_CHL_16 (0x0 << 8) +#define RT5668_I2S1_RX_CHL_20 (0x1 << 8) +#define RT5668_I2S1_RX_CHL_24 (0x2 << 8) +#define RT5668_I2S1_RX_CHL_32 (0x3 << 8) +#define RT5668_I2S1_RX_CHL_8 (0x4 << 8) +#define RT5668_I2S1_MONO_MASK (0x1 << 7) +#define RT5668_I2S1_MONO_EN (0x1 << 7) +#define RT5668_I2S1_MONO_DIS (0x0 << 7) +#define RT5668_I2S2_MONO_MASK (0x1 << 6) +#define RT5668_I2S2_MONO_EN (0x1 << 6) +#define RT5668_I2S2_MONO_DIS (0x0 << 6) +#define RT5668_I2S1_DL_MASK (0x7 << 4) +#define RT5668_I2S1_DL_SFT 4 +#define RT5668_I2S1_DL_16 (0x0 << 4) +#define RT5668_I2S1_DL_20 (0x1 << 4) +#define RT5668_I2S1_DL_24 (0x2 << 4) +#define RT5668_I2S1_DL_32 (0x3 << 4) +#define RT5668_I2S1_DL_8 (0x4 << 4) + +/* I2S1/2 Audio Serial Data Port Control (0x0070)(0x0071) */ +#define RT5668_I2S2_MS_MASK (0x1 << 15) +#define RT5668_I2S2_MS_SFT 15 +#define RT5668_I2S2_MS_M (0x0 << 15) +#define RT5668_I2S2_MS_S (0x1 << 15) +#define RT5668_I2S2_PIN_CFG_MASK (0x1 << 14) +#define RT5668_I2S2_PIN_CFG_SFT 14 +#define RT5668_I2S2_CLK_SEL_MASK (0x1 << 11) +#define RT5668_I2S2_CLK_SEL_SFT 11 +#define 
RT5668_I2S2_OUT_MASK (0x1 << 9) +#define RT5668_I2S2_OUT_SFT 9 +#define RT5668_I2S2_OUT_UM (0x0 << 9) +#define RT5668_I2S2_OUT_M (0x1 << 9) +#define RT5668_I2S_BP_MASK (0x1 << 8) +#define RT5668_I2S_BP_SFT 8 +#define RT5668_I2S_BP_NOR (0x0 << 8) +#define RT5668_I2S_BP_INV (0x1 << 8) +#define RT5668_I2S2_MONO_EN (0x1 << 6) +#define RT5668_I2S2_MONO_DIS (0x0 << 6) +#define RT5668_I2S2_DL_MASK (0x3 << 4) +#define RT5668_I2S2_DL_SFT 4 +#define RT5668_I2S2_DL_16 (0x0 << 4) +#define RT5668_I2S2_DL_20 (0x1 << 4) +#define RT5668_I2S2_DL_24 (0x2 << 4) +#define RT5668_I2S2_DL_8 (0x3 << 4) +#define RT5668_I2S_DF_MASK (0x7) +#define RT5668_I2S_DF_SFT 0 +#define RT5668_I2S_DF_I2S (0x0) +#define RT5668_I2S_DF_LEFT (0x1) +#define RT5668_I2S_DF_PCM_A (0x2) +#define RT5668_I2S_DF_PCM_B (0x3) +#define RT5668_I2S_DF_PCM_A_N (0x6) +#define RT5668_I2S_DF_PCM_B_N (0x7) + +/* ADC/DAC Clock Control 1 (0x0073) */ +#define RT5668_ADC_OSR_MASK (0xf << 12) +#define RT5668_ADC_OSR_SFT 12 +#define RT5668_ADC_OSR_D_1 (0x0 << 12) +#define RT5668_ADC_OSR_D_2 (0x1 << 12) +#define RT5668_ADC_OSR_D_4 (0x2 << 12) +#define RT5668_ADC_OSR_D_6 (0x3 << 12) +#define RT5668_ADC_OSR_D_8 (0x4 << 12) +#define RT5668_ADC_OSR_D_12 (0x5 << 12) +#define RT5668_ADC_OSR_D_16 (0x6 << 12) +#define RT5668_ADC_OSR_D_24 (0x7 << 12) +#define RT5668_ADC_OSR_D_32 (0x8 << 12) +#define RT5668_ADC_OSR_D_48 (0x9 << 12) +#define RT5668_I2S_M_DIV_MASK (0xf << 8) +#define RT5668_I2S_M_DIV_SFT 8 +#define RT5668_I2S_M_D_1 (0x0 << 8) +#define RT5668_I2S_M_D_2 (0x1 << 8) +#define RT5668_I2S_M_D_3 (0x2 << 8) +#define RT5668_I2S_M_D_4 (0x3 << 8) +#define RT5668_I2S_M_D_6 (0x4 << 8) +#define RT5668_I2S_M_D_8 (0x5 << 8) +#define RT5668_I2S_M_D_12 (0x6 << 8) +#define RT5668_I2S_M_D_16 (0x7 << 8) +#define RT5668_I2S_M_D_24 (0x8 << 8) +#define RT5668_I2S_M_D_32 (0x9 << 8) +#define RT5668_I2S_M_D_48 (0xa << 8) +#define RT5668_I2S_CLK_SRC_MASK (0x7 << 4) +#define RT5668_I2S_CLK_SRC_SFT 4 +#define RT5668_I2S_CLK_SRC_MCLK (0x0 << 4) +#define RT5668_I2S_CLK_SRC_PLL1 (0x1 << 4) +#define RT5668_I2S_CLK_SRC_PLL2 (0x2 << 4) +#define RT5668_I2S_CLK_SRC_SDW (0x3 << 4) +#define RT5668_I2S_CLK_SRC_RCCLK (0x4 << 4) /* 25M */ +#define RT5668_DAC_OSR_MASK (0xf << 0) +#define RT5668_DAC_OSR_SFT 0 +#define RT5668_DAC_OSR_D_1 (0x0 << 0) +#define RT5668_DAC_OSR_D_2 (0x1 << 0) +#define RT5668_DAC_OSR_D_4 (0x2 << 0) +#define RT5668_DAC_OSR_D_6 (0x3 << 0) +#define RT5668_DAC_OSR_D_8 (0x4 << 0) +#define RT5668_DAC_OSR_D_12 (0x5 << 0) +#define RT5668_DAC_OSR_D_16 (0x6 << 0) +#define RT5668_DAC_OSR_D_24 (0x7 << 0) +#define RT5668_DAC_OSR_D_32 (0x8 << 0) +#define RT5668_DAC_OSR_D_48 (0x9 << 0) + +/* ADC/DAC Clock Control 2 (0x0074) */ +#define RT5668_I2S2_BCLK_MS2_MASK (0x1 << 11) +#define RT5668_I2S2_BCLK_MS2_SFT 11 +#define RT5668_I2S2_BCLK_MS2_32 (0x0 << 11) +#define RT5668_I2S2_BCLK_MS2_64 (0x1 << 11) + + +/* TDM control 1 (0x0079) */ +#define RT5668_TDM_TX_CH_MASK (0x3 << 12) +#define RT5668_TDM_TX_CH_2 (0x0 << 12) +#define RT5668_TDM_TX_CH_4 (0x1 << 12) +#define RT5668_TDM_TX_CH_6 (0x2 << 12) +#define RT5668_TDM_TX_CH_8 (0x3 << 12) +#define RT5668_TDM_RX_CH_MASK (0x3 << 8) +#define RT5668_TDM_RX_CH_2 (0x0 << 8) +#define RT5668_TDM_RX_CH_4 (0x1 << 8) +#define RT5668_TDM_RX_CH_6 (0x2 << 8) +#define RT5668_TDM_RX_CH_8 (0x3 << 8) +#define RT5668_TDM_ADC_LCA_MASK (0xf << 4) +#define RT5668_TDM_ADC_LCA_SFT 4 +#define RT5668_TDM_ADC_DL_SFT 0 + +/* TDM control 3 (0x007a) */ +#define RT5668_IF1_ADC1_SEL_SFT 14 +#define RT5668_IF1_ADC2_SEL_SFT 12 +#define RT5668_IF1_ADC3_SEL_SFT 10 +#define 
RT5668_IF1_ADC4_SEL_SFT 8 +#define RT5668_TDM_ADC_SEL_SFT 4 + +/* TDM/I2S control (0x007e) */ +#define RT5668_TDM_S_BP_MASK (0x1 << 15) +#define RT5668_TDM_S_BP_SFT 15 +#define RT5668_TDM_S_BP_NOR (0x0 << 15) +#define RT5668_TDM_S_BP_INV (0x1 << 15) +#define RT5668_TDM_S_LP_MASK (0x1 << 14) +#define RT5668_TDM_S_LP_SFT 14 +#define RT5668_TDM_S_LP_NOR (0x0 << 14) +#define RT5668_TDM_S_LP_INV (0x1 << 14) +#define RT5668_TDM_DF_MASK (0x7 << 11) +#define RT5668_TDM_DF_SFT 11 +#define RT5668_TDM_DF_I2S (0x0 << 11) +#define RT5668_TDM_DF_LEFT (0x1 << 11) +#define RT5668_TDM_DF_PCM_A (0x2 << 11) +#define RT5668_TDM_DF_PCM_B (0x3 << 11) +#define RT5668_TDM_DF_PCM_A_N (0x6 << 11) +#define RT5668_TDM_DF_PCM_B_N (0x7 << 11) +#define RT5668_TDM_CL_MASK (0x3 << 4) +#define RT5668_TDM_CL_16 (0x0 << 4) +#define RT5668_TDM_CL_20 (0x1 << 4) +#define RT5668_TDM_CL_24 (0x2 << 4) +#define RT5668_TDM_CL_32 (0x3 << 4) +#define RT5668_TDM_M_BP_MASK (0x1 << 2) +#define RT5668_TDM_M_BP_SFT 2 +#define RT5668_TDM_M_BP_NOR (0x0 << 2) +#define RT5668_TDM_M_BP_INV (0x1 << 2) +#define RT5668_TDM_M_LP_MASK (0x1 << 1) +#define RT5668_TDM_M_LP_SFT 1 +#define RT5668_TDM_M_LP_NOR (0x0 << 1) +#define RT5668_TDM_M_LP_INV (0x1 << 1) +#define RT5668_TDM_MS_MASK (0x1 << 0) +#define RT5668_TDM_MS_SFT 0 +#define RT5668_TDM_MS_M (0x0 << 0) +#define RT5668_TDM_MS_S (0x1 << 0) + +/* Global Clock Control (0x0080) */ +#define RT5668_SCLK_SRC_MASK (0x7 << 13) +#define RT5668_SCLK_SRC_SFT 13 +#define RT5668_SCLK_SRC_MCLK (0x0 << 13) +#define RT5668_SCLK_SRC_PLL1 (0x1 << 13) +#define RT5668_SCLK_SRC_PLL2 (0x2 << 13) +#define RT5668_SCLK_SRC_SDW (0x3 << 13) +#define RT5668_SCLK_SRC_RCCLK (0x4 << 13) +#define RT5668_PLL1_SRC_MASK (0x3 << 10) +#define RT5668_PLL1_SRC_SFT 10 +#define RT5668_PLL1_SRC_MCLK (0x0 << 10) +#define RT5668_PLL1_SRC_BCLK1 (0x1 << 10) +#define RT5668_PLL1_SRC_SDW (0x2 << 10) +#define RT5668_PLL1_SRC_RC (0x3 << 10) +#define RT5668_PLL2_SRC_MASK (0x3 << 8) +#define RT5668_PLL2_SRC_SFT 8 +#define RT5668_PLL2_SRC_MCLK (0x0 << 8) +#define RT5668_PLL2_SRC_BCLK1 (0x1 << 8) +#define RT5668_PLL2_SRC_SDW (0x2 << 8) +#define RT5668_PLL2_SRC_RC (0x3 << 8) + + + +#define RT5668_PLL_INP_MAX 40000000 +#define RT5668_PLL_INP_MIN 256000 +/* PLL M/N/K Code Control 1 (0x0081) */ +#define RT5668_PLL_N_MAX 0x001ff +#define RT5668_PLL_N_MASK (RT5668_PLL_N_MAX << 7) +#define RT5668_PLL_N_SFT 7 +#define RT5668_PLL_K_MAX 0x001f +#define RT5668_PLL_K_MASK (RT5668_PLL_K_MAX) +#define RT5668_PLL_K_SFT 0 + +/* PLL M/N/K Code Control 2 (0x0082) */ +#define RT5668_PLL_M_MAX 0x00f +#define RT5668_PLL_M_MASK (RT5668_PLL_M_MAX << 12) +#define RT5668_PLL_M_SFT 12 +#define RT5668_PLL_M_BP (0x1 << 11) +#define RT5668_PLL_M_BP_SFT 11 +#define RT5668_PLL_K_BP (0x1 << 10) +#define RT5668_PLL_K_BP_SFT 10 + +/* PLL tracking mode 1 (0x0083) */ +#define RT5668_DA_ASRC_MASK (0x1 << 13) +#define RT5668_DA_ASRC_SFT 13 +#define RT5668_DAC_STO1_ASRC_MASK (0x1 << 12) +#define RT5668_DAC_STO1_ASRC_SFT 12 +#define RT5668_AD_ASRC_MASK (0x1 << 8) +#define RT5668_AD_ASRC_SFT 8 +#define RT5668_AD_ASRC_SEL_MASK (0x1 << 4) +#define RT5668_AD_ASRC_SEL_SFT 4 +#define RT5668_DMIC_ASRC_MASK (0x1 << 3) +#define RT5668_DMIC_ASRC_SFT 3 +#define RT5668_ADC_STO1_ASRC_MASK (0x1 << 2) +#define RT5668_ADC_STO1_ASRC_SFT 2 +#define RT5668_DA_ASRC_SEL_MASK (0x1 << 0) +#define RT5668_DA_ASRC_SEL_SFT 0 + +/* PLL tracking mode 2 3 (0x0084)(0x0085)*/ +#define RT5668_FILTER_CLK_SEL_MASK (0x7 << 12) +#define RT5668_FILTER_CLK_SEL_SFT 12 + +/* ASRC Control 4 (0x0086) */ +#define 
RT5668_ASRCIN_FTK_N1_MASK (0x3 << 14) +#define RT5668_ASRCIN_FTK_N1_SFT 14 +#define RT5668_ASRCIN_FTK_N2_MASK (0x3 << 12) +#define RT5668_ASRCIN_FTK_N2_SFT 12 +#define RT5668_ASRCIN_FTK_M1_MASK (0x7 << 8) +#define RT5668_ASRCIN_FTK_M1_SFT 8 +#define RT5668_ASRCIN_FTK_M2_MASK (0x7 << 4) +#define RT5668_ASRCIN_FTK_M2_SFT 4 + +/* SoundWire reference clk (0x008d) */ +#define RT5668_PLL2_OUT_MASK (0x1 << 8) +#define RT5668_PLL2_OUT_98M (0x0 << 8) +#define RT5668_PLL2_OUT_49M (0x1 << 8) +#define RT5668_SDW_REF_2_MASK (0xf << 4) +#define RT5668_SDW_REF_2_SFT 4 +#define RT5668_SDW_REF_2_48K (0x0 << 4) +#define RT5668_SDW_REF_2_96K (0x1 << 4) +#define RT5668_SDW_REF_2_192K (0x2 << 4) +#define RT5668_SDW_REF_2_32K (0x3 << 4) +#define RT5668_SDW_REF_2_24K (0x4 << 4) +#define RT5668_SDW_REF_2_16K (0x5 << 4) +#define RT5668_SDW_REF_2_12K (0x6 << 4) +#define RT5668_SDW_REF_2_8K (0x7 << 4) +#define RT5668_SDW_REF_2_44K (0x8 << 4) +#define RT5668_SDW_REF_2_88K (0x9 << 4) +#define RT5668_SDW_REF_2_176K (0xa << 4) +#define RT5668_SDW_REF_2_353K (0xb << 4) +#define RT5668_SDW_REF_2_22K (0xc << 4) +#define RT5668_SDW_REF_2_384K (0xd << 4) +#define RT5668_SDW_REF_2_11K (0xe << 4) +#define RT5668_SDW_REF_1_MASK (0xf << 0) +#define RT5668_SDW_REF_1_SFT 0 +#define RT5668_SDW_REF_1_48K (0x0 << 0) +#define RT5668_SDW_REF_1_96K (0x1 << 0) +#define RT5668_SDW_REF_1_192K (0x2 << 0) +#define RT5668_SDW_REF_1_32K (0x3 << 0) +#define RT5668_SDW_REF_1_24K (0x4 << 0) +#define RT5668_SDW_REF_1_16K (0x5 << 0) +#define RT5668_SDW_REF_1_12K (0x6 << 0) +#define RT5668_SDW_REF_1_8K (0x7 << 0) +#define RT5668_SDW_REF_1_44K (0x8 << 0) +#define RT5668_SDW_REF_1_88K (0x9 << 0) +#define RT5668_SDW_REF_1_176K (0xa << 0) +#define RT5668_SDW_REF_1_353K (0xb << 0) +#define RT5668_SDW_REF_1_22K (0xc << 0) +#define RT5668_SDW_REF_1_384K (0xd << 0) +#define RT5668_SDW_REF_1_11K (0xe << 0) + +/* Depop Mode Control 1 (0x008e) */ +#define RT5668_PUMP_EN (0x1 << 3) +#define RT5668_PUMP_EN_SFT 3 +#define RT5668_CAPLESS_EN (0x1 << 0) +#define RT5668_CAPLESS_EN_SFT 0 + +/* Depop Mode Control 2 (0x8f) */ +#define RT5668_RAMP_MASK (0x1 << 12) +#define RT5668_RAMP_SFT 12 +#define RT5668_RAMP_DIS (0x0 << 12) +#define RT5668_RAMP_EN (0x1 << 12) +#define RT5668_BPS_MASK (0x1 << 11) +#define RT5668_BPS_SFT 11 +#define RT5668_BPS_DIS (0x0 << 11) +#define RT5668_BPS_EN (0x1 << 11) +#define RT5668_FAST_UPDN_MASK (0x1 << 10) +#define RT5668_FAST_UPDN_SFT 10 +#define RT5668_FAST_UPDN_DIS (0x0 << 10) +#define RT5668_FAST_UPDN_EN (0x1 << 10) +#define RT5668_VLO_MASK (0x1 << 7) +#define RT5668_VLO_SFT 7 +#define RT5668_VLO_3V (0x0 << 7) +#define RT5668_VLO_33V (0x1 << 7) + +/* HPOUT charge pump 1 (0x0091) */ +#define RT5668_OSW_L_MASK (0x1 << 11) +#define RT5668_OSW_L_SFT 11 +#define RT5668_OSW_L_DIS (0x0 << 11) +#define RT5668_OSW_L_EN (0x1 << 11) +#define RT5668_OSW_R_MASK (0x1 << 10) +#define RT5668_OSW_R_SFT 10 +#define RT5668_OSW_R_DIS (0x0 << 10) +#define RT5668_OSW_R_EN (0x1 << 10) +#define RT5668_PM_HP_MASK (0x3 << 8) +#define RT5668_PM_HP_SFT 8 +#define RT5668_PM_HP_LV (0x0 << 8) +#define RT5668_PM_HP_MV (0x1 << 8) +#define RT5668_PM_HP_HV (0x2 << 8) +#define RT5668_IB_HP_MASK (0x3 << 6) +#define RT5668_IB_HP_SFT 6 +#define RT5668_IB_HP_125IL (0x0 << 6) +#define RT5668_IB_HP_25IL (0x1 << 6) +#define RT5668_IB_HP_5IL (0x2 << 6) +#define RT5668_IB_HP_1IL (0x3 << 6) + +/* Micbias Control1 (0x93) */ +#define RT5668_MIC1_OV_MASK (0x3 << 14) +#define RT5668_MIC1_OV_SFT 14 +#define RT5668_MIC1_OV_2V7 (0x0 << 14) +#define RT5668_MIC1_OV_2V4 (0x1 << 14) +#define 
RT5668_MIC1_OV_2V25 (0x3 << 14) +#define RT5668_MIC1_OV_1V8 (0x4 << 14) +#define RT5668_MIC1_CLK_MASK (0x1 << 13) +#define RT5668_MIC1_CLK_SFT 13 +#define RT5668_MIC1_CLK_DIS (0x0 << 13) +#define RT5668_MIC1_CLK_EN (0x1 << 13) +#define RT5668_MIC1_OVCD_MASK (0x1 << 12) +#define RT5668_MIC1_OVCD_SFT 12 +#define RT5668_MIC1_OVCD_DIS (0x0 << 12) +#define RT5668_MIC1_OVCD_EN (0x1 << 12) +#define RT5668_MIC1_OVTH_MASK (0x3 << 10) +#define RT5668_MIC1_OVTH_SFT 10 +#define RT5668_MIC1_OVTH_768UA (0x0 << 10) +#define RT5668_MIC1_OVTH_960UA (0x1 << 10) +#define RT5668_MIC1_OVTH_1152UA (0x2 << 10) +#define RT5668_MIC1_OVTH_1960UA (0x3 << 10) +#define RT5668_MIC2_OV_MASK (0x3 << 8) +#define RT5668_MIC2_OV_SFT 8 +#define RT5668_MIC2_OV_2V7 (0x0 << 8) +#define RT5668_MIC2_OV_2V4 (0x1 << 8) +#define RT5668_MIC2_OV_2V25 (0x3 << 8) +#define RT5668_MIC2_OV_1V8 (0x4 << 8) +#define RT5668_MIC2_CLK_MASK (0x1 << 7) +#define RT5668_MIC2_CLK_SFT 7 +#define RT5668_MIC2_CLK_DIS (0x0 << 7) +#define RT5668_MIC2_CLK_EN (0x1 << 7) +#define RT5668_MIC2_OVTH_MASK (0x3 << 4) +#define RT5668_MIC2_OVTH_SFT 4 +#define RT5668_MIC2_OVTH_768UA (0x0 << 4) +#define RT5668_MIC2_OVTH_960UA (0x1 << 4) +#define RT5668_MIC2_OVTH_1152UA (0x2 << 4) +#define RT5668_MIC2_OVTH_1960UA (0x3 << 4) +#define RT5668_PWR_MB_MASK (0x1 << 3) +#define RT5668_PWR_MB_SFT 3 +#define RT5668_PWR_MB_PD (0x0 << 3) +#define RT5668_PWR_MB_PU (0x1 << 3) + +/* Micbias Control2 (0x0094) */ +#define RT5668_PWR_CLK25M_MASK (0x1 << 9) +#define RT5668_PWR_CLK25M_SFT 9 +#define RT5668_PWR_CLK25M_PD (0x0 << 9) +#define RT5668_PWR_CLK25M_PU (0x1 << 9) +#define RT5668_PWR_CLK1M_MASK (0x1 << 8) +#define RT5668_PWR_CLK1M_SFT 8 +#define RT5668_PWR_CLK1M_PD (0x0 << 8) +#define RT5668_PWR_CLK1M_PU (0x1 << 8) + +/* RC Clock Control (0x009f) */ +#define RT5668_POW_IRQ (0x1 << 15) +#define RT5668_POW_JDH (0x1 << 14) +#define RT5668_POW_JDL (0x1 << 13) +#define RT5668_POW_ANA (0x1 << 12) + +/* I2S Master Mode Clock Control 1 (0x00a0) */ +#define RT5668_CLK_SRC_MCLK (0x0) +#define RT5668_CLK_SRC_PLL1 (0x1) +#define RT5668_CLK_SRC_PLL2 (0x2) +#define RT5668_CLK_SRC_SDW (0x3) +#define RT5668_CLK_SRC_RCCLK (0x4) +#define RT5668_I2S_PD_1 (0x0) +#define RT5668_I2S_PD_2 (0x1) +#define RT5668_I2S_PD_3 (0x2) +#define RT5668_I2S_PD_4 (0x3) +#define RT5668_I2S_PD_6 (0x4) +#define RT5668_I2S_PD_8 (0x5) +#define RT5668_I2S_PD_12 (0x6) +#define RT5668_I2S_PD_16 (0x7) +#define RT5668_I2S_PD_24 (0x8) +#define RT5668_I2S_PD_32 (0x9) +#define RT5668_I2S_PD_48 (0xa) +#define RT5668_I2S2_SRC_MASK (0x3 << 4) +#define RT5668_I2S2_SRC_SFT 4 +#define RT5668_I2S2_M_PD_MASK (0xf << 0) +#define RT5668_I2S2_M_PD_SFT 0 + +/* IRQ Control 1 (0x00b6) */ +#define RT5668_JD1_PULSE_EN_MASK (0x1 << 10) +#define RT5668_JD1_PULSE_EN_SFT 10 +#define RT5668_JD1_PULSE_DIS (0x0 << 10) +#define RT5668_JD1_PULSE_EN (0x1 << 10) + +/* IRQ Control 2 (0x00b7) */ +#define RT5668_JD1_EN_MASK (0x1 << 15) +#define RT5668_JD1_EN_SFT 15 +#define RT5668_JD1_DIS (0x0 << 15) +#define RT5668_JD1_EN (0x1 << 15) +#define RT5668_JD1_POL_MASK (0x1 << 13) +#define RT5668_JD1_POL_NOR (0x0 << 13) +#define RT5668_JD1_POL_INV (0x1 << 13) + +/* IRQ Control 3 (0x00b8) */ +#define RT5668_IL_IRQ_MASK (0x1 << 7) +#define RT5668_IL_IRQ_DIS (0x0 << 7) +#define RT5668_IL_IRQ_EN (0x1 << 7) + +/* GPIO Control 1 (0x00c0) */ +#define RT5668_GP1_PIN_MASK (0x3 << 14) +#define RT5668_GP1_PIN_SFT 14 +#define RT5668_GP1_PIN_GPIO1 (0x0 << 14) +#define RT5668_GP1_PIN_IRQ (0x1 << 14) +#define RT5668_GP1_PIN_DMIC_CLK (0x2 << 14) +#define RT5668_GP2_PIN_MASK (0x3 
<< 12) +#define RT5668_GP2_PIN_SFT 12 +#define RT5668_GP2_PIN_GPIO2 (0x0 << 12) +#define RT5668_GP2_PIN_LRCK2 (0x1 << 12) +#define RT5668_GP2_PIN_DMIC_SDA (0x2 << 12) +#define RT5668_GP3_PIN_MASK (0x3 << 10) +#define RT5668_GP3_PIN_SFT 10 +#define RT5668_GP3_PIN_GPIO3 (0x0 << 10) +#define RT5668_GP3_PIN_BCLK2 (0x1 << 10) +#define RT5668_GP3_PIN_DMIC_CLK (0x2 << 10) +#define RT5668_GP4_PIN_MASK (0x3 << 8) +#define RT5668_GP4_PIN_SFT 8 +#define RT5668_GP4_PIN_GPIO4 (0x0 << 8) +#define RT5668_GP4_PIN_ADCDAT1 (0x1 << 8) +#define RT5668_GP4_PIN_DMIC_CLK (0x2 << 8) +#define RT5668_GP4_PIN_ADCDAT2 (0x3 << 8) +#define RT5668_GP5_PIN_MASK (0x3 << 6) +#define RT5668_GP5_PIN_SFT 6 +#define RT5668_GP5_PIN_GPIO5 (0x0 << 6) +#define RT5668_GP5_PIN_DACDAT1 (0x1 << 6) +#define RT5668_GP5_PIN_DMIC_SDA (0x2 << 6) +#define RT5668_GP6_PIN_MASK (0x1 << 5) +#define RT5668_GP6_PIN_SFT 5 +#define RT5668_GP6_PIN_GPIO6 (0x0 << 5) +#define RT5668_GP6_PIN_LRCK1 (0x1 << 5) + +/* GPIO Control 2 (0x00c1)*/ +#define RT5668_GP1_PF_MASK (0x1 << 15) +#define RT5668_GP1_PF_IN (0x0 << 15) +#define RT5668_GP1_PF_OUT (0x1 << 15) +#define RT5668_GP1_OUT_MASK (0x1 << 14) +#define RT5668_GP1_OUT_L (0x0 << 14) +#define RT5668_GP1_OUT_H (0x1 << 14) +#define RT5668_GP2_PF_MASK (0x1 << 13) +#define RT5668_GP2_PF_IN (0x0 << 13) +#define RT5668_GP2_PF_OUT (0x1 << 13) +#define RT5668_GP2_OUT_MASK (0x1 << 12) +#define RT5668_GP2_OUT_L (0x0 << 12) +#define RT5668_GP2_OUT_H (0x1 << 12) +#define RT5668_GP3_PF_MASK (0x1 << 11) +#define RT5668_GP3_PF_IN (0x0 << 11) +#define RT5668_GP3_PF_OUT (0x1 << 11) +#define RT5668_GP3_OUT_MASK (0x1 << 10) +#define RT5668_GP3_OUT_L (0x0 << 10) +#define RT5668_GP3_OUT_H (0x1 << 10) +#define RT5668_GP4_PF_MASK (0x1 << 9) +#define RT5668_GP4_PF_IN (0x0 << 9) +#define RT5668_GP4_PF_OUT (0x1 << 9) +#define RT5668_GP4_OUT_MASK (0x1 << 8) +#define RT5668_GP4_OUT_L (0x0 << 8) +#define RT5668_GP4_OUT_H (0x1 << 8) +#define RT5668_GP5_PF_MASK (0x1 << 7) +#define RT5668_GP5_PF_IN (0x0 << 7) +#define RT5668_GP5_PF_OUT (0x1 << 7) +#define RT5668_GP5_OUT_MASK (0x1 << 6) +#define RT5668_GP5_OUT_L (0x0 << 6) +#define RT5668_GP5_OUT_H (0x1 << 6) +#define RT5668_GP6_PF_MASK (0x1 << 5) +#define RT5668_GP6_PF_IN (0x0 << 5) +#define RT5668_GP6_PF_OUT (0x1 << 5) +#define RT5668_GP6_OUT_MASK (0x1 << 4) +#define RT5668_GP6_OUT_L (0x0 << 4) +#define RT5668_GP6_OUT_H (0x1 << 4) + + +/* GPIO Status (0x00c2) */ +#define RT5668_GP6_STA (0x1 << 6) +#define RT5668_GP5_STA (0x1 << 5) +#define RT5668_GP4_STA (0x1 << 4) +#define RT5668_GP3_STA (0x1 << 3) +#define RT5668_GP2_STA (0x1 << 2) +#define RT5668_GP1_STA (0x1 << 1) + +/* Soft volume and zero cross control 1 (0x00d9) */ +#define RT5668_SV_MASK (0x1 << 15) +#define RT5668_SV_SFT 15 +#define RT5668_SV_DIS (0x0 << 15) +#define RT5668_SV_EN (0x1 << 15) +#define RT5668_ZCD_MASK (0x1 << 10) +#define RT5668_ZCD_SFT 10 +#define RT5668_ZCD_PD (0x0 << 10) +#define RT5668_ZCD_PU (0x1 << 10) +#define RT5668_SV_DLY_MASK (0xf) +#define RT5668_SV_DLY_SFT 0 + +/* Soft volume and zero cross control 2 (0x00da) */ +#define RT5668_ZCD_BST1_CBJ_MASK (0x1 << 7) +#define RT5668_ZCD_BST1_CBJ_SFT 7 +#define RT5668_ZCD_BST1_CBJ_DIS (0x0 << 7) +#define RT5668_ZCD_BST1_CBJ_EN (0x1 << 7) +#define RT5668_ZCD_RECMIX_MASK (0x1) +#define RT5668_ZCD_RECMIX_SFT 0 +#define RT5668_ZCD_RECMIX_DIS (0x0) +#define RT5668_ZCD_RECMIX_EN (0x1) + +/* 4 Button Inline Command Control 2 (0x00e3) */ +#define RT5668_4BTN_IL_MASK (0x1 << 15) +#define RT5668_4BTN_IL_EN (0x1 << 15) +#define RT5668_4BTN_IL_DIS (0x0 << 15) +#define 
RT5668_4BTN_IL_RST_MASK (0x1 << 14) +#define RT5668_4BTN_IL_NOR (0x1 << 14) +#define RT5668_4BTN_IL_RST (0x0 << 14) + +/* Analog JD Control (0x00f0) */ +#define RT5668_JDH_RS_MASK (0x1 << 4) +#define RT5668_JDH_NO_PLUG (0x1 << 4) +#define RT5668_JDH_PLUG (0x0 << 4) + +/* Chopper and Clock control for DAC (0x013a)*/ +#define RT5668_CKXEN_DAC1_MASK (0x1 << 13) +#define RT5668_CKXEN_DAC1_SFT 13 +#define RT5668_CKGEN_DAC1_MASK (0x1 << 12) +#define RT5668_CKGEN_DAC1_SFT 12 + +/* Chopper and Clock control for ADC (0x013b)*/ +#define RT5668_CKXEN_ADC1_MASK (0x1 << 13) +#define RT5668_CKXEN_ADC1_SFT 13 +#define RT5668_CKGEN_ADC1_MASK (0x1 << 12) +#define RT5668_CKGEN_ADC1_SFT 12 + +/* Volume test (0x013f)*/ +#define RT5668_SEL_CLK_VOL_MASK (0x1 << 15) +#define RT5668_SEL_CLK_VOL_EN (0x1 << 15) +#define RT5668_SEL_CLK_VOL_DIS (0x0 << 15) + +/* Test Mode Control 1 (0x0145) */ +#define RT5668_AD2DA_LB_MASK (0x1 << 10) +#define RT5668_AD2DA_LB_SFT 10 + +/* Stereo Noise Gate Control 1 (0x0160) */ +#define RT5668_NG2_EN_MASK (0x1 << 15) +#define RT5668_NG2_EN (0x1 << 15) +#define RT5668_NG2_DIS (0x0 << 15) + +/* Stereo1 DAC Silence Detection Control (0x0190) */ +#define RT5668_DEB_STO_DAC_MASK (0x7 << 4) +#define RT5668_DEB_80_MS (0x0 << 4) + +/* SAR ADC Inline Command Control 1 (0x0210) */ +#define RT5668_SAR_BUTT_DET_MASK (0x1 << 15) +#define RT5668_SAR_BUTT_DET_EN (0x1 << 15) +#define RT5668_SAR_BUTT_DET_DIS (0x0 << 15) +#define RT5668_SAR_BUTDET_MODE_MASK (0x1 << 14) +#define RT5668_SAR_BUTDET_POW_SAV (0x1 << 14) +#define RT5668_SAR_BUTDET_POW_NORM (0x0 << 14) +#define RT5668_SAR_BUTDET_RST_MASK (0x1 << 13) +#define RT5668_SAR_BUTDET_RST_NORMAL (0x1 << 13) +#define RT5668_SAR_BUTDET_RST (0x0 << 13) +#define RT5668_SAR_POW_MASK (0x1 << 12) +#define RT5668_SAR_POW_EN (0x1 << 12) +#define RT5668_SAR_POW_DIS (0x0 << 12) +#define RT5668_SAR_RST_MASK (0x1 << 11) +#define RT5668_SAR_RST_NORMAL (0x1 << 11) +#define RT5668_SAR_RST (0x0 << 11) +#define RT5668_SAR_BYPASS_MASK (0x1 << 10) +#define RT5668_SAR_BYPASS_EN (0x1 << 10) +#define RT5668_SAR_BYPASS_DIS (0x0 << 10) +#define RT5668_SAR_SEL_MB1_MASK (0x1 << 9) +#define RT5668_SAR_SEL_MB1_SEL (0x1 << 9) +#define RT5668_SAR_SEL_MB1_NOSEL (0x0 << 9) +#define RT5668_SAR_SEL_MB2_MASK (0x1 << 8) +#define RT5668_SAR_SEL_MB2_SEL (0x1 << 8) +#define RT5668_SAR_SEL_MB2_NOSEL (0x0 << 8) +#define RT5668_SAR_SEL_MODE_MASK (0x1 << 7) +#define RT5668_SAR_SEL_MODE_CMP (0x1 << 7) +#define RT5668_SAR_SEL_MODE_ADC (0x0 << 7) +#define RT5668_SAR_SEL_MB1_MB2_MASK (0x1 << 5) +#define RT5668_SAR_SEL_MB1_MB2_AUTO (0x1 << 5) +#define RT5668_SAR_SEL_MB1_MB2_MANU (0x0 << 5) +#define RT5668_SAR_SEL_SIGNAL_MASK (0x1 << 4) +#define RT5668_SAR_SEL_SIGNAL_AUTO (0x1 << 4) +#define RT5668_SAR_SEL_SIGNAL_MANU (0x0 << 4) + +/* SAR ADC Inline Command Control 13 (0x021c) */ +#define RT5668_SAR_SOUR_MASK (0x3f) +#define RT5668_SAR_SOUR_BTN (0x3f) +#define RT5668_SAR_SOUR_TYPE (0x0) + + +/* System Clock Source */ +enum { + RT5668_SCLK_S_MCLK, + RT5668_SCLK_S_PLL1, + RT5668_SCLK_S_PLL2, + RT5668_SCLK_S_RCCLK, +}; + +/* PLL Source */ +enum { + RT5668_PLL1_S_MCLK, + RT5668_PLL1_S_BCLK1, + RT5668_PLL1_S_RCCLK, +}; + +enum { + RT5668_AIF1, + RT5668_AIF2, + RT5668_AIFS +}; + +/* filter mask */ +enum { + RT5668_DA_STEREO1_FILTER = 0x1, + RT5668_AD_STEREO1_FILTER = (0x1 << 1), +}; + +enum { + RT5668_CLK_SEL_SYS, + RT5668_CLK_SEL_I2S1_ASRC, + RT5668_CLK_SEL_I2S2_ASRC, +}; + +int rt5668_sel_asrc_clk_src(struct snd_soc_component *component, + unsigned int filter_mask, unsigned int clk_src); + +#endif /* 
__RT5668_H__ */ -- cgit v1.2.3 From ae0e28265e216dad11d4cbde42fc15e92919af78 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Wed, 11 Apr 2018 09:39:25 +0200 Subject: drm/blend: Add a generic alpha property Some drivers duplicate the logic to create a property to store a per-plane alpha. This is especially useful if we ever want to support extra protocols for Wayland like: https://lists.freedesktop.org/archives/wayland-devel/2017-August/034741.html Let's create a helper in order to move that to the core. Acked-by: Maarten Lankhorst Acked-by: Sean Paul Reviewed-by: Boris Brezillon Reviewed-by: Eric Anholt Reviewed-by: Laurent Pinchart Reviewed-by: Paul Kocialkowski Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/6e1ce0db78fcfc407e94913c64819e65109d034d.1523432341.git-series.maxime.ripard@bootlin.com --- drivers/gpu/drm/drm_atomic.c | 4 ++++ drivers/gpu/drm/drm_atomic_helper.c | 4 ++++ drivers/gpu/drm/drm_blend.c | 39 +++++++++++++++++++++++++++++++++++++ include/drm/drm_blend.h | 3 +++ include/drm/drm_plane.h | 6 ++++++ 5 files changed, 56 insertions(+) (limited to 'include') diff --git a/drivers/gpu/drm/drm_atomic.c b/drivers/gpu/drm/drm_atomic.c index 7d25c42f22db..3d9ae057a6cd 100644 --- a/drivers/gpu/drm/drm_atomic.c +++ b/drivers/gpu/drm/drm_atomic.c @@ -783,6 +783,8 @@ static int drm_atomic_plane_set_property(struct drm_plane *plane, state->src_w = val; } else if (property == config->prop_src_h) { state->src_h = val; + } else if (property == plane->alpha_property) { + state->alpha = val; } else if (property == plane->rotation_property) { if (!is_power_of_2(val & DRM_MODE_ROTATE_MASK)) return -EINVAL; @@ -848,6 +850,8 @@ drm_atomic_plane_get_property(struct drm_plane *plane, *val = state->src_w; } else if (property == config->prop_src_h) { *val = state->src_h; + } else if (property == plane->alpha_property) { + *val = state->alpha; } else if (property == plane->rotation_property) { *val = state->rotation; } else if (property == plane->zpos_property) { diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c index ee03c1ed2521..0587a0a2f3aa 100644 --- a/drivers/gpu/drm/drm_atomic_helper.c +++ b/drivers/gpu/drm/drm_atomic_helper.c @@ -3500,6 +3500,10 @@ void drm_atomic_helper_plane_reset(struct drm_plane *plane) if (plane->state) { plane->state->plane = plane; plane->state->rotation = DRM_MODE_ROTATE_0; + + /* Reset the alpha value to fully opaque if it matters */ + if (plane->alpha_property) + plane->state->alpha = plane->alpha_property->values[1]; } } EXPORT_SYMBOL(drm_atomic_helper_plane_reset); diff --git a/drivers/gpu/drm/drm_blend.c b/drivers/gpu/drm/drm_blend.c index 5a81e1b4c076..a16a74d7e15e 100644 --- a/drivers/gpu/drm/drm_blend.c +++ b/drivers/gpu/drm/drm_blend.c @@ -88,6 +88,13 @@ * On top of this basic transformation additional properties can be exposed by * the driver: * + * alpha: + * Alpha is set up with drm_plane_create_alpha_property(). It controls the + * plane-wide opacity, from transparent (0) to opaque (0xffff). It can be + * combined with pixel alpha. + * The pixel values in the framebuffers are expected not to be + * pre-multiplied by the global alpha associated with the plane. + * * rotation: * Rotation is set up with drm_plane_create_rotation_property(). It adds a * rotation and reflection step between the source and destination rectangles. @@ -105,6 +112,38 @@ * exposed and assumed to be black). 
*/ +/** + * drm_plane_create_alpha_property - create a new alpha property + * @plane: drm plane + * + * This function creates a generic, mutable alpha property and enables support + * for it in the DRM core. It is attached to @plane. + * + * The alpha property will be allowed to be within the bounds of 0 + * (transparent) to 0xffff (opaque). + * + * Returns: + * 0 on success, negative error code on failure. + */ +int drm_plane_create_alpha_property(struct drm_plane *plane) +{ + struct drm_property *prop; + + prop = drm_property_create_range(plane->dev, 0, "alpha", + 0, DRM_BLEND_ALPHA_OPAQUE); + if (!prop) + return -ENOMEM; + + drm_object_attach_property(&plane->base, prop, DRM_BLEND_ALPHA_OPAQUE); + plane->alpha_property = prop; + + if (plane->state) + plane->state->alpha = DRM_BLEND_ALPHA_OPAQUE; + + return 0; +} +EXPORT_SYMBOL(drm_plane_create_alpha_property); + /** * drm_plane_create_rotation_property - create a new rotation property * @plane: drm plane diff --git a/include/drm/drm_blend.h b/include/drm/drm_blend.h index 17606026590b..330c561c4c11 100644 --- a/include/drm/drm_blend.h +++ b/include/drm/drm_blend.h @@ -36,6 +36,9 @@ static inline bool drm_rotation_90_or_270(unsigned int rotation) return rotation & (DRM_MODE_ROTATE_90 | DRM_MODE_ROTATE_270); } +#define DRM_BLEND_ALPHA_OPAQUE 0xffff + +int drm_plane_create_alpha_property(struct drm_plane *plane); int drm_plane_create_rotation_property(struct drm_plane *plane, unsigned int rotation, unsigned int supported_rotations); diff --git a/include/drm/drm_plane.h b/include/drm/drm_plane.h index d6da26d66a4b..9563bd25f19b 100644 --- a/include/drm/drm_plane.h +++ b/include/drm/drm_plane.h @@ -43,6 +43,7 @@ struct drm_modeset_acquire_ctx; * plane (in 16.16) * @src_w: width of visible portion of plane (in 16.16) * @src_h: height of visible portion of plane (in 16.16) + * @alpha: opacity of the plane * @rotation: rotation of the plane * @zpos: priority of the given plane on crtc (optional) * Note that multiple active planes on the same crtc can have an identical @@ -106,6 +107,9 @@ struct drm_plane_state { uint32_t src_x, src_y; uint32_t src_h, src_w; + /* Plane opacity */ + u16 alpha; + /* Plane rotation */ unsigned int rotation; @@ -496,6 +500,7 @@ enum drm_plane_type { * @funcs: helper functions * @properties: property tracking for this plane * @type: type of plane (overlay, primary, cursor) + * @alpha_property: alpha property for this plane * @zpos_property: zpos property for this plane * @rotation_property: rotation property for this plane * @helper_private: mid-layer private data @@ -571,6 +576,7 @@ struct drm_plane { */ struct drm_plane_state *state; + struct drm_property *alpha_property; struct drm_property *zpos_property; struct drm_property *rotation_property; -- cgit v1.2.3 From ad56b738c5dd223a2f66685830f82194025a6138 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 21 Mar 2018 21:22:47 +0200 Subject: docs/vm: rename documentation files to .rst Signed-off-by: Mike Rapoport Signed-off-by: Jonathan Corbet --- Documentation/ABI/stable/sysfs-devices-node | 2 +- .../ABI/testing/sysfs-kernel-mm-hugepages | 2 +- Documentation/ABI/testing/sysfs-kernel-mm-ksm | 2 +- Documentation/ABI/testing/sysfs-kernel-slab | 4 +- Documentation/admin-guide/kernel-parameters.txt | 12 +- Documentation/dev-tools/kasan.rst | 2 +- Documentation/filesystems/proc.txt | 4 +- Documentation/filesystems/tmpfs.txt | 2 +- Documentation/sysctl/vm.txt | 6 +- Documentation/vm/00-INDEX | 58 +- Documentation/vm/active_mm.rst | 91 +++ 
Documentation/vm/active_mm.txt | 91 --- Documentation/vm/balance | 102 ---- Documentation/vm/balance.rst | 102 ++++ Documentation/vm/cleancache.rst | 296 ++++++++++ Documentation/vm/cleancache.txt | 296 ---------- Documentation/vm/frontswap.rst | 293 ++++++++++ Documentation/vm/frontswap.txt | 293 ---------- Documentation/vm/highmem.rst | 147 +++++ Documentation/vm/highmem.txt | 147 ----- Documentation/vm/hmm.rst | 374 +++++++++++++ Documentation/vm/hmm.txt | 374 ------------- Documentation/vm/hugetlbfs_reserv.rst | 587 ++++++++++++++++++++ Documentation/vm/hugetlbfs_reserv.txt | 587 -------------------- Documentation/vm/hugetlbpage.rst | 386 +++++++++++++ Documentation/vm/hugetlbpage.txt | 386 ------------- Documentation/vm/hwpoison.rst | 186 +++++++ Documentation/vm/hwpoison.txt | 186 ------- Documentation/vm/idle_page_tracking.rst | 115 ++++ Documentation/vm/idle_page_tracking.txt | 115 ---- Documentation/vm/ksm.rst | 183 ++++++ Documentation/vm/ksm.txt | 183 ------ Documentation/vm/mmu_notifier.rst | 99 ++++ Documentation/vm/mmu_notifier.txt | 99 ---- Documentation/vm/numa | 150 ----- Documentation/vm/numa.rst | 150 +++++ Documentation/vm/numa_memory_policy.rst | 485 ++++++++++++++++ Documentation/vm/numa_memory_policy.txt | 485 ---------------- Documentation/vm/overcommit-accounting | 87 --- Documentation/vm/overcommit-accounting.rst | 87 +++ Documentation/vm/page_frags | 45 -- Documentation/vm/page_frags.rst | 45 ++ Documentation/vm/page_migration | 257 --------- Documentation/vm/page_migration.rst | 257 +++++++++ Documentation/vm/page_owner.rst | 90 +++ Documentation/vm/page_owner.txt | 90 --- Documentation/vm/pagemap.rst | 197 +++++++ Documentation/vm/pagemap.txt | 197 ------- Documentation/vm/remap_file_pages.rst | 33 ++ Documentation/vm/remap_file_pages.txt | 33 -- Documentation/vm/slub.rst | 361 ++++++++++++ Documentation/vm/slub.txt | 361 ------------ Documentation/vm/soft-dirty.rst | 47 ++ Documentation/vm/soft-dirty.txt | 47 -- Documentation/vm/split_page_table_lock | 100 ---- Documentation/vm/split_page_table_lock.rst | 100 ++++ Documentation/vm/swap_numa.rst | 80 +++ Documentation/vm/swap_numa.txt | 80 --- Documentation/vm/transhuge.rst | 573 +++++++++++++++++++ Documentation/vm/transhuge.txt | 573 ------------------- Documentation/vm/unevictable-lru.rst | 614 +++++++++++++++++++++ Documentation/vm/unevictable-lru.txt | 614 --------------------- Documentation/vm/userfaultfd.rst | 241 ++++++++ Documentation/vm/userfaultfd.txt | 241 -------- Documentation/vm/z3fold.rst | 30 + Documentation/vm/z3fold.txt | 30 - Documentation/vm/zsmalloc.rst | 82 +++ Documentation/vm/zsmalloc.txt | 82 --- Documentation/vm/zswap.rst | 135 +++++ Documentation/vm/zswap.txt | 135 ----- MAINTAINERS | 2 +- arch/alpha/Kconfig | 2 +- arch/ia64/Kconfig | 2 +- arch/mips/Kconfig | 2 +- arch/powerpc/Kconfig | 2 +- fs/Kconfig | 2 +- fs/dax.c | 2 +- fs/proc/task_mmu.c | 4 +- include/linux/hmm.h | 2 +- include/linux/memremap.h | 4 +- include/linux/mmu_notifier.h | 2 +- include/linux/sched/mm.h | 4 +- include/linux/swap.h | 2 +- mm/Kconfig | 6 +- mm/cleancache.c | 2 +- mm/frontswap.c | 2 +- mm/hmm.c | 2 +- mm/huge_memory.c | 4 +- mm/hugetlb.c | 4 +- mm/ksm.c | 4 +- mm/mmap.c | 2 +- mm/rmap.c | 6 +- mm/util.c | 2 +- 93 files changed, 6546 insertions(+), 6546 deletions(-) create mode 100644 Documentation/vm/active_mm.rst delete mode 100644 Documentation/vm/active_mm.txt delete mode 100644 Documentation/vm/balance create mode 100644 Documentation/vm/balance.rst create mode 100644 Documentation/vm/cleancache.rst 
delete mode 100644 Documentation/vm/cleancache.txt create mode 100644 Documentation/vm/frontswap.rst delete mode 100644 Documentation/vm/frontswap.txt create mode 100644 Documentation/vm/highmem.rst delete mode 100644 Documentation/vm/highmem.txt create mode 100644 Documentation/vm/hmm.rst delete mode 100644 Documentation/vm/hmm.txt create mode 100644 Documentation/vm/hugetlbfs_reserv.rst delete mode 100644 Documentation/vm/hugetlbfs_reserv.txt create mode 100644 Documentation/vm/hugetlbpage.rst delete mode 100644 Documentation/vm/hugetlbpage.txt create mode 100644 Documentation/vm/hwpoison.rst delete mode 100644 Documentation/vm/hwpoison.txt create mode 100644 Documentation/vm/idle_page_tracking.rst delete mode 100644 Documentation/vm/idle_page_tracking.txt create mode 100644 Documentation/vm/ksm.rst delete mode 100644 Documentation/vm/ksm.txt create mode 100644 Documentation/vm/mmu_notifier.rst delete mode 100644 Documentation/vm/mmu_notifier.txt delete mode 100644 Documentation/vm/numa create mode 100644 Documentation/vm/numa.rst create mode 100644 Documentation/vm/numa_memory_policy.rst delete mode 100644 Documentation/vm/numa_memory_policy.txt delete mode 100644 Documentation/vm/overcommit-accounting create mode 100644 Documentation/vm/overcommit-accounting.rst delete mode 100644 Documentation/vm/page_frags create mode 100644 Documentation/vm/page_frags.rst delete mode 100644 Documentation/vm/page_migration create mode 100644 Documentation/vm/page_migration.rst create mode 100644 Documentation/vm/page_owner.rst delete mode 100644 Documentation/vm/page_owner.txt create mode 100644 Documentation/vm/pagemap.rst delete mode 100644 Documentation/vm/pagemap.txt create mode 100644 Documentation/vm/remap_file_pages.rst delete mode 100644 Documentation/vm/remap_file_pages.txt create mode 100644 Documentation/vm/slub.rst delete mode 100644 Documentation/vm/slub.txt create mode 100644 Documentation/vm/soft-dirty.rst delete mode 100644 Documentation/vm/soft-dirty.txt delete mode 100644 Documentation/vm/split_page_table_lock create mode 100644 Documentation/vm/split_page_table_lock.rst create mode 100644 Documentation/vm/swap_numa.rst delete mode 100644 Documentation/vm/swap_numa.txt create mode 100644 Documentation/vm/transhuge.rst delete mode 100644 Documentation/vm/transhuge.txt create mode 100644 Documentation/vm/unevictable-lru.rst delete mode 100644 Documentation/vm/unevictable-lru.txt create mode 100644 Documentation/vm/userfaultfd.rst delete mode 100644 Documentation/vm/userfaultfd.txt create mode 100644 Documentation/vm/z3fold.rst delete mode 100644 Documentation/vm/z3fold.txt create mode 100644 Documentation/vm/zsmalloc.rst delete mode 100644 Documentation/vm/zsmalloc.txt create mode 100644 Documentation/vm/zswap.rst delete mode 100644 Documentation/vm/zswap.txt (limited to 'include') diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node index 5b2d0f08867c..b38f4b734567 100644 --- a/Documentation/ABI/stable/sysfs-devices-node +++ b/Documentation/ABI/stable/sysfs-devices-node @@ -90,4 +90,4 @@ Date: December 2009 Contact: Lee Schermerhorn Description: The node's huge page size control/query attributes. 
- See Documentation/vm/hugetlbpage.txt \ No newline at end of file + See Documentation/vm/hugetlbpage.rst \ No newline at end of file diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-hugepages b/Documentation/ABI/testing/sysfs-kernel-mm-hugepages index e21c00571cf4..5140b233356c 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-hugepages +++ b/Documentation/ABI/testing/sysfs-kernel-mm-hugepages @@ -12,4 +12,4 @@ Description: free_hugepages surplus_hugepages resv_hugepages - See Documentation/vm/hugetlbpage.txt for details. + See Documentation/vm/hugetlbpage.rst for details. diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-ksm b/Documentation/ABI/testing/sysfs-kernel-mm-ksm index 73e653ee2481..dfc13244cda3 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-ksm +++ b/Documentation/ABI/testing/sysfs-kernel-mm-ksm @@ -40,7 +40,7 @@ Description: Kernel Samepage Merging daemon sysfs interface sleep_millisecs: how many milliseconds ksm should sleep between scans. - See Documentation/vm/ksm.txt for more information. + See Documentation/vm/ksm.rst for more information. What: /sys/kernel/mm/ksm/merge_across_nodes Date: January 2013 diff --git a/Documentation/ABI/testing/sysfs-kernel-slab b/Documentation/ABI/testing/sysfs-kernel-slab index 2cc0a72b64be..29601d93a1c2 100644 --- a/Documentation/ABI/testing/sysfs-kernel-slab +++ b/Documentation/ABI/testing/sysfs-kernel-slab @@ -37,7 +37,7 @@ Description: The alloc_calls file is read-only and lists the kernel code locations from which allocations for this cache were performed. The alloc_calls file only contains information if debugging is - enabled for that cache (see Documentation/vm/slub.txt). + enabled for that cache (see Documentation/vm/slub.rst). What: /sys/kernel/slab/cache/alloc_fastpath Date: February 2008 @@ -219,7 +219,7 @@ Contact: Pekka Enberg , Description: The free_calls file is read-only and lists the locations of object frees if slab debugging is enabled (see - Documentation/vm/slub.txt). + Documentation/vm/slub.rst). What: /sys/kernel/slab/cache/free_fastpath Date: February 2008 diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 1d1d53f85ddd..5d6e5509c049 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3887,7 +3887,7 @@ cache (risks via metadata attacks are mostly unchanged). Debug options disable merging on their own. - For more information see Documentation/vm/slub.txt. + For more information see Documentation/vm/slub.rst. slab_max_order= [MM, SLAB] Determines the maximum allowed order for slabs. @@ -3901,7 +3901,7 @@ slub_debug can create guard zones around objects and may poison objects when not in use. Also tracks the last alloc / free. For more information see - Documentation/vm/slub.txt. + Documentation/vm/slub.rst. slub_memcg_sysfs= [MM, SLUB] Determines whether to enable sysfs directories for @@ -3915,7 +3915,7 @@ Determines the maximum allowed order for slabs. A high setting may cause OOMs due to memory fragmentation. For more information see - Documentation/vm/slub.txt. + Documentation/vm/slub.rst. slub_min_objects= [MM, SLUB] The minimum number of objects per slab. SLUB will @@ -3924,12 +3924,12 @@ the number of objects indicated. The higher the number of objects the smaller the overhead of tracking slabs and the less frequently locks need to be acquired. - For more information see Documentation/vm/slub.txt. + For more information see Documentation/vm/slub.rst. 
slub_min_order= [MM, SLUB] Determines the minimum page order for slabs. Must be lower than slub_max_order. - For more information see Documentation/vm/slub.txt. + For more information see Documentation/vm/slub.rst. slub_nomerge [MM, SLUB] Same with slab_nomerge. This is supported for legacy. @@ -4285,7 +4285,7 @@ Format: [always|madvise|never] Can be used to control the default behavior of the system with respect to transparent hugepages. - See Documentation/vm/transhuge.txt for more details. + See Documentation/vm/transhuge.rst for more details. tsc= Disable clocksource stability checks for TSC. Format: diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index f7a18f274357..aabc8738b3d8 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -120,7 +120,7 @@ A typical out of bounds access report looks like this:: The header of the report discribe what kind of bug happened and what kind of access caused it. It's followed by the description of the accessed slub object -(see 'SLUB Debug output' section in Documentation/vm/slub.txt for details) and +(see 'SLUB Debug output' section in Documentation/vm/slub.rst for details) and the description of the accessed memory page. In the last section the report shows memory state around the accessed address. diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 2a84bb334894..2d3984c70feb 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -515,7 +515,7 @@ guarantees: The /proc/PID/clear_refs is used to reset the PG_Referenced and ACCESSED/YOUNG bits on both physical and virtual pages associated with a process, and the -soft-dirty bit on pte (see Documentation/vm/soft-dirty.txt for details). +soft-dirty bit on pte (see Documentation/vm/soft-dirty.rst for details). To clear the bits for all the pages associated with the process > echo 1 > /proc/PID/clear_refs @@ -536,7 +536,7 @@ Any other value written to /proc/PID/clear_refs will have no effect. The /proc/pid/pagemap gives the PFN, which can be used to find the pageflags using /proc/kpageflags and number of times a page is mapped using -/proc/kpagecount. For detailed explanation, see Documentation/vm/pagemap.txt. +/proc/kpagecount. For detailed explanation, see Documentation/vm/pagemap.rst. The /proc/pid/numa_maps is an extension based on maps, showing the memory locality and binding policy, as well as the memory usage (in pages) of diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt index a85355cf85f4..627389a34f77 100644 --- a/Documentation/filesystems/tmpfs.txt +++ b/Documentation/filesystems/tmpfs.txt @@ -105,7 +105,7 @@ policy for the file will revert to "default" policy. NUMA memory allocation policies have optional flags that can be used in conjunction with their modes. These optional flags can be specified when tmpfs is mounted by appending them to the mode before the NodeList. -See Documentation/vm/numa_memory_policy.txt for a list of all available +See Documentation/vm/numa_memory_policy.rst for a list of all available memory allocation policy mode flags and their effect on memory policy. =static is equivalent to MPOL_F_STATIC_NODES diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index ff234d229cbb..ef581a940439 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -516,7 +516,7 @@ nr_hugepages Change the minimum size of the hugepage pool. 
-See Documentation/vm/hugetlbpage.txt +See Documentation/vm/hugetlbpage.rst ============================================================== @@ -525,7 +525,7 @@ nr_overcommit_hugepages Change the maximum size of the hugepage pool. The maximum is nr_hugepages + nr_overcommit_hugepages. -See Documentation/vm/hugetlbpage.txt +See Documentation/vm/hugetlbpage.rst ============================================================== @@ -668,7 +668,7 @@ and don't use much of it. The default value is 0. -See Documentation/vm/overcommit-accounting and +See Documentation/vm/overcommit-accounting.rst and mm/mmap.c::__vm_enough_memory() for more information. ============================================================== diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX index 0278f2c85efb..cda564d55b3c 100644 --- a/Documentation/vm/00-INDEX +++ b/Documentation/vm/00-INDEX @@ -1,62 +1,62 @@ 00-INDEX - this file. -active_mm.txt +active_mm.rst - An explanation from Linus about tsk->active_mm vs tsk->mm. -balance +balance.rst - various information on memory balancing. -cleancache.txt +cleancache.rst - Intro to cleancache and page-granularity victim cache. -frontswap.txt +frontswap.rst - Outline frontswap, part of the transcendent memory frontend. -highmem.txt +highmem.rst - Outline of highmem and common issues. -hmm.txt +hmm.rst - Documentation of heterogeneous memory management -hugetlbpage.txt +hugetlbpage.rst - a brief summary of hugetlbpage support in the Linux kernel. -hugetlbfs_reserv.txt +hugetlbfs_reserv.rst - A brief overview of hugetlbfs reservation design/implementation. -hwpoison.txt +hwpoison.rst - explains what hwpoison is -idle_page_tracking.txt +idle_page_tracking.rst - description of the idle page tracking feature. -ksm.txt +ksm.rst - how to use the Kernel Samepage Merging feature. -mmu_notifier.txt +mmu_notifier.rst - a note about clearing pte/pmd and mmu notifications -numa +numa.rst - information about NUMA specific code in the Linux vm. -numa_memory_policy.txt +numa_memory_policy.rst - documentation of concepts and APIs of the 2.6 memory policy support. -overcommit-accounting +overcommit-accounting.rst - description of the Linux kernels overcommit handling modes. -page_frags +page_frags.rst - description of page fragments allocator -page_migration +page_migration.rst - description of page migration in NUMA systems. -pagemap.txt +pagemap.rst - pagemap, from the userspace perspective -page_owner.txt +page_owner.rst - tracking about who allocated each page -remap_file_pages.txt +remap_file_pages.rst - a note about remap_file_pages() system call -slub.txt +slub.rst - a short users guide for SLUB. -soft-dirty.txt +soft-dirty.rst - short explanation for soft-dirty PTEs -split_page_table_lock +split_page_table_lock.rst - Separate per-table lock to improve scalability of the old page_table_lock. -swap_numa.txt +swap_numa.rst - automatic binding of swap device to numa node -transhuge.txt +transhuge.rst - Transparent Hugepage Support, alternative way of using hugepages. 
-unevictable-lru.txt +unevictable-lru.rst - Unevictable LRU infrastructure -userfaultfd.txt +userfaultfd.rst - description of userfaultfd system call z3fold.txt - outline of z3fold allocator for storing compressed pages -zsmalloc.txt +zsmalloc.rst - outline of zsmalloc allocator for storing compressed pages -zswap.txt +zswap.rst - Intro to compressed cache for swap pages diff --git a/Documentation/vm/active_mm.rst b/Documentation/vm/active_mm.rst new file mode 100644 index 000000000000..c84471b180f8 --- /dev/null +++ b/Documentation/vm/active_mm.rst @@ -0,0 +1,91 @@ +.. _active_mm: + +========= +Active MM +========= + +:: + + List: linux-kernel + Subject: Re: active_mm + From: Linus Torvalds + Date: 1999-07-30 21:36:24 + + Cc'd to linux-kernel, because I don't write explanations all that often, + and when I do I feel better about more people reading them. + + On Fri, 30 Jul 1999, David Mosberger wrote: + > + > Is there a brief description someplace on how "mm" vs. "active_mm" in + > the task_struct are supposed to be used? (My apologies if this was + > discussed on the mailing lists---I just returned from vacation and + > wasn't able to follow linux-kernel for a while). + + Basically, the new setup is: + + - we have "real address spaces" and "anonymous address spaces". The + difference is that an anonymous address space doesn't care about the + user-level page tables at all, so when we do a context switch into an + anonymous address space we just leave the previous address space + active. + + The obvious use for a "anonymous address space" is any thread that + doesn't need any user mappings - all kernel threads basically fall into + this category, but even "real" threads can temporarily say that for + some amount of time they are not going to be interested in user space, + and that the scheduler might as well try to avoid wasting time on + switching the VM state around. Currently only the old-style bdflush + sync does that. + + - "tsk->mm" points to the "real address space". For an anonymous process, + tsk->mm will be NULL, for the logical reason that an anonymous process + really doesn't _have_ a real address space at all. + + - however, we obviously need to keep track of which address space we + "stole" for such an anonymous user. For that, we have "tsk->active_mm", + which shows what the currently active address space is. + + The rule is that for a process with a real address space (ie tsk->mm is + non-NULL) the active_mm obviously always has to be the same as the real + one. + + For a anonymous process, tsk->mm == NULL, and tsk->active_mm is the + "borrowed" mm while the anonymous process is running. When the + anonymous process gets scheduled away, the borrowed address space is + returned and cleared. + + To support all that, the "struct mm_struct" now has two counters: a + "mm_users" counter that is how many "real address space users" there are, + and a "mm_count" counter that is the number of "lazy" users (ie anonymous + users) plus one if there are any real users. + + Usually there is at least one real user, but it could be that the real + user exited on another CPU while a lazy user was still active, so you do + actually get cases where you have a address space that is _only_ used by + lazy users. That is often a short-lived state, because once that thread + gets scheduled away in favour of a real thread, the "zombie" mm gets + released because "mm_users" becomes zero. + + Also, a new rule is that _nobody_ ever has "init_mm" as a real MM any + more. 
"init_mm" should be considered just a "lazy context when no other + context is available", and in fact it is mainly used just at bootup when + no real VM has yet been created. So code that used to check + + if (current->mm == &init_mm) + + should generally just do + + if (!current->mm) + + instead (which makes more sense anyway - the test is basically one of "do + we have a user context", and is generally done by the page fault handler + and things like that). + + Anyway, I put a pre-patch-2.3.13-1 on ftp.kernel.org just a moment ago, + because it slightly changes the interfaces to accommodate the alpha (who + would have thought it, but the alpha actually ends up having one of the + ugliest context switch codes - unlike the other architectures where the MM + and register state is separate, the alpha PALcode joins the two, and you + need to switch both together). + + (From http://marc.info/?l=linux-kernel&m=93337278602211&w=2) diff --git a/Documentation/vm/active_mm.txt b/Documentation/vm/active_mm.txt deleted file mode 100644 index c84471b180f8..000000000000 --- a/Documentation/vm/active_mm.txt +++ /dev/null @@ -1,91 +0,0 @@ -.. _active_mm: - -========= -Active MM -========= - -:: - - List: linux-kernel - Subject: Re: active_mm - From: Linus Torvalds - Date: 1999-07-30 21:36:24 - - Cc'd to linux-kernel, because I don't write explanations all that often, - and when I do I feel better about more people reading them. - - On Fri, 30 Jul 1999, David Mosberger wrote: - > - > Is there a brief description someplace on how "mm" vs. "active_mm" in - > the task_struct are supposed to be used? (My apologies if this was - > discussed on the mailing lists---I just returned from vacation and - > wasn't able to follow linux-kernel for a while). - - Basically, the new setup is: - - - we have "real address spaces" and "anonymous address spaces". The - difference is that an anonymous address space doesn't care about the - user-level page tables at all, so when we do a context switch into an - anonymous address space we just leave the previous address space - active. - - The obvious use for a "anonymous address space" is any thread that - doesn't need any user mappings - all kernel threads basically fall into - this category, but even "real" threads can temporarily say that for - some amount of time they are not going to be interested in user space, - and that the scheduler might as well try to avoid wasting time on - switching the VM state around. Currently only the old-style bdflush - sync does that. - - - "tsk->mm" points to the "real address space". For an anonymous process, - tsk->mm will be NULL, for the logical reason that an anonymous process - really doesn't _have_ a real address space at all. - - - however, we obviously need to keep track of which address space we - "stole" for such an anonymous user. For that, we have "tsk->active_mm", - which shows what the currently active address space is. - - The rule is that for a process with a real address space (ie tsk->mm is - non-NULL) the active_mm obviously always has to be the same as the real - one. - - For a anonymous process, tsk->mm == NULL, and tsk->active_mm is the - "borrowed" mm while the anonymous process is running. When the - anonymous process gets scheduled away, the borrowed address space is - returned and cleared. 
- - To support all that, the "struct mm_struct" now has two counters: a - "mm_users" counter that is how many "real address space users" there are, - and a "mm_count" counter that is the number of "lazy" users (ie anonymous - users) plus one if there are any real users. - - Usually there is at least one real user, but it could be that the real - user exited on another CPU while a lazy user was still active, so you do - actually get cases where you have a address space that is _only_ used by - lazy users. That is often a short-lived state, because once that thread - gets scheduled away in favour of a real thread, the "zombie" mm gets - released because "mm_users" becomes zero. - - Also, a new rule is that _nobody_ ever has "init_mm" as a real MM any - more. "init_mm" should be considered just a "lazy context when no other - context is available", and in fact it is mainly used just at bootup when - no real VM has yet been created. So code that used to check - - if (current->mm == &init_mm) - - should generally just do - - if (!current->mm) - - instead (which makes more sense anyway - the test is basically one of "do - we have a user context", and is generally done by the page fault handler - and things like that). - - Anyway, I put a pre-patch-2.3.13-1 on ftp.kernel.org just a moment ago, - because it slightly changes the interfaces to accommodate the alpha (who - would have thought it, but the alpha actually ends up having one of the - ugliest context switch codes - unlike the other architectures where the MM - and register state is separate, the alpha PALcode joins the two, and you - need to switch both together). - - (From http://marc.info/?l=linux-kernel&m=93337278602211&w=2) diff --git a/Documentation/vm/balance b/Documentation/vm/balance deleted file mode 100644 index 6a1fadf3e173..000000000000 --- a/Documentation/vm/balance +++ /dev/null @@ -1,102 +0,0 @@ -.. _balance: - -================ -Memory Balancing -================ - -Started Jan 2000 by Kanoj Sarcar - -Memory balancing is needed for !__GFP_ATOMIC and !__GFP_KSWAPD_RECLAIM as -well as for non __GFP_IO allocations. - -The first reason why a caller may avoid reclaim is that the caller can not -sleep due to holding a spinlock or is in interrupt context. The second may -be that the caller is willing to fail the allocation without incurring the -overhead of page reclaim. This may happen for opportunistic high-order -allocation requests that have order-0 fallback options. In such cases, -the caller may also wish to avoid waking kswapd. - -__GFP_IO allocation requests are made to prevent file system deadlocks. - -In the absence of non sleepable allocation requests, it seems detrimental -to be doing balancing. Page reclamation can be kicked off lazily, that -is, only when needed (aka zone free memory is 0), instead of making it -a proactive process. - -That being said, the kernel should try to fulfill requests for direct -mapped pages from the direct mapped pool, instead of falling back on -the dma pool, so as to keep the dma pool filled for dma requests (atomic -or not). A similar argument applies to highmem and direct mapped pages. -OTOH, if there is a lot of free dma pages, it is preferable to satisfy -regular memory requests by allocating one from the dma pool, instead -of incurring the overhead of regular zone balancing. - -In 2.2, memory balancing/page reclamation would kick off only when the -_total_ number of free pages fell below 1/64 th of total memory. 
With the -right ratio of dma and regular memory, it is quite possible that balancing -would not be done even when the dma zone was completely empty. 2.2 has -been running production machines of varying memory sizes, and seems to be -doing fine even with the presence of this problem. In 2.3, due to -HIGHMEM, this problem is aggravated. - -In 2.3, zone balancing can be done in one of two ways: depending on the -zone size (and possibly of the size of lower class zones), we can decide -at init time how many free pages we should aim for while balancing any -zone. The good part is, while balancing, we do not need to look at sizes -of lower class zones, the bad part is, we might do too frequent balancing -due to ignoring possibly lower usage in the lower class zones. Also, -with a slight change in the allocation routine, it is possible to reduce -the memclass() macro to be a simple equality. - -Another possible solution is that we balance only when the free memory -of a zone _and_ all its lower class zones falls below 1/64th of the -total memory in the zone and its lower class zones. This fixes the 2.2 -balancing problem, and stays as close to 2.2 behavior as possible. Also, -the balancing algorithm works the same way on the various architectures, -which have different numbers and types of zones. If we wanted to get -fancy, we could assign different weights to free pages in different -zones in the future. - -Note that if the size of the regular zone is huge compared to dma zone, -it becomes less significant to consider the free dma pages while -deciding whether to balance the regular zone. The first solution -becomes more attractive then. - -The appended patch implements the second solution. It also "fixes" two -problems: first, kswapd is woken up as in 2.2 on low memory conditions -for non-sleepable allocations. Second, the HIGHMEM zone is also balanced, -so as to give a fighting chance for replace_with_highmem() to get a -HIGHMEM page, as well as to ensure that HIGHMEM allocations do not -fall back into regular zone. This also makes sure that HIGHMEM pages -are not leaked (for example, in situations where a HIGHMEM page is in -the swapcache but is not being used by anyone) - -kswapd also needs to know about the zones it should balance. kswapd is -primarily needed in a situation where balancing can not be done, -probably because all allocation requests are coming from intr context -and all process contexts are sleeping. For 2.3, kswapd does not really -need to balance the highmem zone, since intr context does not request -highmem pages. kswapd looks at the zone_wake_kswapd field in the zone -structure to decide whether a zone needs balancing. - -Page stealing from process memory and shm is done if stealing the page would -alleviate memory pressure on any zone in the page's node that has fallen below -its watermark. - -watemark[WMARK_MIN/WMARK_LOW/WMARK_HIGH]/low_on_memory/zone_wake_kswapd: These -are per-zone fields, used to determine when a zone needs to be balanced. When -the number of pages falls below watermark[WMARK_MIN], the hysteric field -low_on_memory gets set. This stays set till the number of free pages becomes -watermark[WMARK_HIGH]. When low_on_memory is set, page allocation requests will -try to free some pages in the zone (providing GFP_WAIT is set in the request). -Orthogonal to this, is the decision to poke kswapd to free some zone pages. 
-That decision is not hysteresis based, and is done when the number of free -pages is below watermark[WMARK_LOW]; in which case zone_wake_kswapd is also set. - - -(Good) Ideas that I have heard: - -1. Dynamic experience should influence balancing: number of failed requests - for a zone can be tracked and fed into the balancing scheme (jalvo@mbay.net) -2. Implement a replace_with_highmem()-like replace_with_regular() to preserve - dma pages. (lkd@tantalophile.demon.co.uk) diff --git a/Documentation/vm/balance.rst b/Documentation/vm/balance.rst new file mode 100644 index 000000000000..6a1fadf3e173 --- /dev/null +++ b/Documentation/vm/balance.rst @@ -0,0 +1,102 @@ +.. _balance: + +================ +Memory Balancing +================ + +Started Jan 2000 by Kanoj Sarcar + +Memory balancing is needed for !__GFP_ATOMIC and !__GFP_KSWAPD_RECLAIM as +well as for non __GFP_IO allocations. + +The first reason why a caller may avoid reclaim is that the caller can not +sleep due to holding a spinlock or is in interrupt context. The second may +be that the caller is willing to fail the allocation without incurring the +overhead of page reclaim. This may happen for opportunistic high-order +allocation requests that have order-0 fallback options. In such cases, +the caller may also wish to avoid waking kswapd. + +__GFP_IO allocation requests are made to prevent file system deadlocks. + +In the absence of non sleepable allocation requests, it seems detrimental +to be doing balancing. Page reclamation can be kicked off lazily, that +is, only when needed (aka zone free memory is 0), instead of making it +a proactive process. + +That being said, the kernel should try to fulfill requests for direct +mapped pages from the direct mapped pool, instead of falling back on +the dma pool, so as to keep the dma pool filled for dma requests (atomic +or not). A similar argument applies to highmem and direct mapped pages. +OTOH, if there is a lot of free dma pages, it is preferable to satisfy +regular memory requests by allocating one from the dma pool, instead +of incurring the overhead of regular zone balancing. + +In 2.2, memory balancing/page reclamation would kick off only when the +_total_ number of free pages fell below 1/64 th of total memory. With the +right ratio of dma and regular memory, it is quite possible that balancing +would not be done even when the dma zone was completely empty. 2.2 has +been running production machines of varying memory sizes, and seems to be +doing fine even with the presence of this problem. In 2.3, due to +HIGHMEM, this problem is aggravated. + +In 2.3, zone balancing can be done in one of two ways: depending on the +zone size (and possibly of the size of lower class zones), we can decide +at init time how many free pages we should aim for while balancing any +zone. The good part is, while balancing, we do not need to look at sizes +of lower class zones, the bad part is, we might do too frequent balancing +due to ignoring possibly lower usage in the lower class zones. Also, +with a slight change in the allocation routine, it is possible to reduce +the memclass() macro to be a simple equality. + +Another possible solution is that we balance only when the free memory +of a zone _and_ all its lower class zones falls below 1/64th of the +total memory in the zone and its lower class zones. This fixes the 2.2 +balancing problem, and stays as close to 2.2 behavior as possible. 
Also,
+the balancing algorithm works the same way on the various architectures,
+which have different numbers and types of zones. If we wanted to get
+fancy, we could assign different weights to free pages in different
+zones in the future.
+
+Note that if the size of the regular zone is huge compared to dma zone,
+it becomes less significant to consider the free dma pages while
+deciding whether to balance the regular zone. The first solution
+becomes more attractive then.
+
+The appended patch implements the second solution. It also "fixes" two
+problems: first, kswapd is woken up as in 2.2 on low memory conditions
+for non-sleepable allocations. Second, the HIGHMEM zone is also balanced,
+so as to give a fighting chance for replace_with_highmem() to get a
+HIGHMEM page, as well as to ensure that HIGHMEM allocations do not
+fall back into regular zone. This also makes sure that HIGHMEM pages
+are not leaked (for example, in situations where a HIGHMEM page is in
+the swapcache but is not being used by anyone)
+
+kswapd also needs to know about the zones it should balance. kswapd is
+primarily needed in a situation where balancing can not be done,
+probably because all allocation requests are coming from intr context
+and all process contexts are sleeping. For 2.3, kswapd does not really
+need to balance the highmem zone, since intr context does not request
+highmem pages. kswapd looks at the zone_wake_kswapd field in the zone
+structure to decide whether a zone needs balancing.
+
+Page stealing from process memory and shm is done if stealing the page would
+alleviate memory pressure on any zone in the page's node that has fallen below
+its watermark.
+
+watermark[WMARK_MIN/WMARK_LOW/WMARK_HIGH]/low_on_memory/zone_wake_kswapd: These
+are per-zone fields, used to determine when a zone needs to be balanced. When
+the number of pages falls below watermark[WMARK_MIN], the hysteresis field
+low_on_memory gets set. This stays set till the number of free pages becomes
+watermark[WMARK_HIGH]. When low_on_memory is set, page allocation requests will
+try to free some pages in the zone (providing GFP_WAIT is set in the request).
+Orthogonal to this is the decision to poke kswapd to free some zone pages.
+That decision is not hysteresis based, and is done when the number of free
+pages is below watermark[WMARK_LOW]; in which case zone_wake_kswapd is also set.
+
+
+(Good) Ideas that I have heard:
+
+1. Dynamic experience should influence balancing: number of failed requests
+   for a zone can be tracked and fed into the balancing scheme (jalvo@mbay.net)
+2. Implement a replace_with_highmem()-like replace_with_regular() to preserve
+   dma pages. (lkd@tantalophile.demon.co.uk)
diff --git a/Documentation/vm/cleancache.rst b/Documentation/vm/cleancache.rst
new file mode 100644
index 000000000000..68cba9131c31
--- /dev/null
+++ b/Documentation/vm/cleancache.rst
@@ -0,0 +1,296 @@
+.. _cleancache:
+
+==========
+Cleancache
+==========
+
+Motivation
+==========
+
+Cleancache is a new optional feature provided by the VFS layer that
+potentially dramatically increases page cache effectiveness for
+many workloads in many environments at a negligible cost.
+
+Cleancache can be thought of as a page-granularity victim cache for clean
+pages that the kernel's pageframe replacement algorithm (PFRA) would like
+to keep around, but can't since there isn't enough memory.
So when the +PFRA "evicts" a page, it first attempts to use cleancache code to +put the data contained in that page into "transcendent memory", memory +that is not directly accessible or addressable by the kernel and is +of unknown and possibly time-varying size. + +Later, when a cleancache-enabled filesystem wishes to access a page +in a file on disk, it first checks cleancache to see if it already +contains it; if it does, the page of data is copied into the kernel +and a disk access is avoided. + +Transcendent memory "drivers" for cleancache are currently implemented +in Xen (using hypervisor memory) and zcache (using in-kernel compressed +memory) and other implementations are in development. + +:ref:`FAQs ` are included below. + +Implementation Overview +======================= + +A cleancache "backend" that provides transcendent memory registers itself +to the kernel's cleancache "frontend" by calling cleancache_register_ops, +passing a pointer to a cleancache_ops structure with funcs set appropriately. +The functions provided must conform to certain semantics as follows: + +Most important, cleancache is "ephemeral". Pages which are copied into +cleancache have an indefinite lifetime which is completely unknowable +by the kernel and so may or may not still be in cleancache at any later time. +Thus, as its name implies, cleancache is not suitable for dirty pages. +Cleancache has complete discretion over what pages to preserve and what +pages to discard and when. + +Mounting a cleancache-enabled filesystem should call "init_fs" to obtain a +pool id which, if positive, must be saved in the filesystem's superblock; +a negative return value indicates failure. A "put_page" will copy a +(presumably about-to-be-evicted) page into cleancache and associate it with +the pool id, a file key, and a page index into the file. (The combination +of a pool id, a file key, and an index is sometimes called a "handle".) +A "get_page" will copy the page, if found, from cleancache into kernel memory. +An "invalidate_page" will ensure the page no longer is present in cleancache; +an "invalidate_inode" will invalidate all pages associated with the specified +file; and, when a filesystem is unmounted, an "invalidate_fs" will invalidate +all pages in all files specified by the given pool id and also surrender +the pool id. + +An "init_shared_fs", like init_fs, obtains a pool id but tells cleancache +to treat the pool as shared using a 128-bit UUID as a key. On systems +that may run multiple kernels (such as hard partitioned or virtualized +systems) that may share a clustered filesystem, and where cleancache +may be shared among those kernels, calls to init_shared_fs that specify the +same UUID will receive the same pool id, thus allowing the pages to +be shared. Note that any security requirements must be imposed outside +of the kernel (e.g. by "tools" that control cleancache). Or a +cleancache implementation can simply disable shared_init by always +returning a negative value. + +If a get_page is successful on a non-shared pool, the page is invalidated +(thus making cleancache an "exclusive" cache). On a shared pool, the page +is NOT invalidated on a successful get_page so that it remains accessible to +other sharers. The kernel is responsible for ensuring coherency between +cleancache (shared or not), the page cache, and the filesystem, using +cleancache invalidate operations as required. + +Note that cleancache must enforce put-put-get coherency and get-get +coherency. 
For the former, if two puts are made to the same handle but +with different data, say AAA by the first put and BBB by the second, a +subsequent get can never return the stale data (AAA). For get-get coherency, +if a get for a given handle fails, subsequent gets for that handle will +never succeed unless preceded by a successful put with that handle. + +Last, cleancache provides no SMP serialization guarantees; if two +different Linux threads are simultaneously putting and invalidating a page +with the same handle, the results are indeterminate. Callers must +lock the page to ensure serial behavior. + +Cleancache Performance Metrics +============================== + +If properly configured, monitoring of cleancache is done via debugfs in +the `/sys/kernel/debug/cleancache` directory. The effectiveness of cleancache +can be measured (across all filesystems) with: + +``succ_gets`` + number of gets that were successful + +``failed_gets`` + number of gets that failed + +``puts`` + number of puts attempted (all "succeed") + +``invalidates`` + number of invalidates attempted + +A backend implementation may provide additional metrics. + +.. _faq: + +FAQ +=== + +* Where's the value? (Andrew Morton) + +Cleancache provides a significant performance benefit to many workloads +in many environments with negligible overhead by improving the +effectiveness of the pagecache. Clean pagecache pages are +saved in transcendent memory (RAM that is otherwise not directly +addressable to the kernel); fetching those pages later avoids "refaults" +and thus disk reads. + +Cleancache (and its sister code "frontswap") provide interfaces for +this transcendent memory (aka "tmem"), which conceptually lies between +fast kernel-directly-addressable RAM and slower DMA/asynchronous devices. +Disallowing direct kernel or userland reads/writes to tmem +is ideal when data is transformed to a different form and size (such +as with compression) or secretly moved (as might be useful for write- +balancing for some RAM-like devices). Evicted page-cache pages (and +swap pages) are a great use for this kind of slower-than-RAM-but-much- +faster-than-disk transcendent memory, and the cleancache (and frontswap) +"page-object-oriented" specification provides a nice way to read and +write -- and indirectly "name" -- the pages. + +In the virtual case, the whole point of virtualization is to statistically +multiplex physical resources across the varying demands of multiple +virtual machines. This is really hard to do with RAM and efforts to +do it well with no kernel change have essentially failed (except in some +well-publicized special-case workloads). Cleancache -- and frontswap -- +with a fairly small impact on the kernel, provide a huge amount +of flexibility for more dynamic, flexible RAM multiplexing. +Specifically, the Xen Transcendent Memory backend allows otherwise +"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple +virtual machines, but the pages can be compressed and deduplicated to +optimize RAM utilization. And when guest OS's are induced to surrender +underutilized RAM (e.g. with "self-ballooning"), page cache pages +are the first to go, and cleancache allows those pages to be +saved and reclaimed if overall host system memory conditions allow. + +And the identical interface used for cleancache can be used in +physical systems as well. The zcache driver acts as a memory-hungry +device that stores pages of data in a compressed state. 
And +the proposed "RAMster" driver shares RAM across multiple physical +systems. + +* Why does cleancache have its sticky fingers so deep inside the + filesystems and VFS? (Andrew Morton and Christoph Hellwig) + +The core hooks for cleancache in VFS are in most cases a single line +and the minimum set are placed precisely where needed to maintain +coherency (via cleancache_invalidate operations) between cleancache, +the page cache, and disk. All hooks compile into nothingness if +cleancache is config'ed off and turn into a function-pointer- +compare-to-NULL if config'ed on but no backend claims the ops +functions, or to a compare-struct-element-to-negative if a +backend claims the ops functions but a filesystem doesn't enable +cleancache. + +Some filesystems are built entirely on top of VFS and the hooks +in VFS are sufficient, so don't require an "init_fs" hook; the +initial implementation of cleancache didn't provide this hook. +But for some filesystems (such as btrfs), the VFS hooks are +incomplete and one or more hooks in fs-specific code are required. +And for some other filesystems, such as tmpfs, cleancache may +be counterproductive. So it seemed prudent to require a filesystem +to "opt in" to use cleancache, which requires adding a hook in +each filesystem. Not all filesystems are supported by cleancache +only because they haven't been tested. The existing set should +be sufficient to validate the concept, the opt-in approach means +that untested filesystems are not affected, and the hooks in the +existing filesystems should make it very easy to add more +filesystems in the future. + +The total impact of the hooks to existing fs and mm files is only +about 40 lines added (not counting comments and blank lines). + +* Why not make cleancache asynchronous and batched so it can more + easily interface with real devices with DMA instead of copying each + individual page? (Minchan Kim) + +The one-page-at-a-time copy semantics simplifies the implementation +on both the frontend and backend and also allows the backend to +do fancy things on-the-fly like page compression and +page deduplication. And since the data is "gone" (copied into/out +of the pageframe) before the cleancache get/put call returns, +a great deal of race conditions and potential coherency issues +are avoided. While the interface seems odd for a "real device" +or for real kernel-addressable RAM, it makes perfect sense for +transcendent memory. + +* Why is non-shared cleancache "exclusive"? And where is the + page "invalidated" after a "get"? (Minchan Kim) + +The main reason is to free up space in transcendent memory and +to avoid unnecessary cleancache_invalidate calls. If you want inclusive, +the page can be "put" immediately following the "get". If +put-after-get for inclusive becomes common, the interface could +be easily extended to add a "get_no_invalidate" call. + +The invalidate is done by the cleancache backend implementation. + +* What's the performance impact? + +Performance analysis has been presented at OLS'09 and LCA'10. +Briefly, performance gains can be significant on most workloads, +especially when memory pressure is high (e.g. when RAM is +overcommitted in a virtual workload); and because the hooks are +invoked primarily in place of or in addition to a disk read/write, +overhead is negligible even in worst case workloads. 
Basically
+cleancache replaces I/O with memory-copy-CPU-overhead; on older
+single-core systems with slow memory-copy speeds, cleancache
+has little value, but in newer multicore machines, especially
+consolidated/virtualized machines, it has great value.
+
+* How do I add cleancache support for filesystem X? (Boaz Harrash)
+
+Filesystems that are well-behaved and conform to certain
+restrictions can utilize cleancache simply by making a call to
+cleancache_init_fs at mount time. Unusual, misbehaving, or
+poorly layered filesystems must either add additional hooks
+and/or undergo extensive additional testing... or should just
+not enable the optional cleancache.
+
+Some points for a filesystem to consider:
+
+  - The FS should be block-device-based (e.g. a ram-based FS such
+    as tmpfs should not enable cleancache)
+  - To ensure coherency/correctness, the FS must ensure that all
+    file removal or truncation operations either go through VFS or
+    add hooks to do the equivalent cleancache "invalidate" operations
+  - To ensure coherency/correctness, either inode numbers must
+    be unique across the lifetime of the on-disk file OR the
+    FS must provide an "encode_fh" function.
+  - The FS must call the VFS superblock alloc and deactivate routines
+    or add hooks to do the equivalent cleancache calls done there.
+  - To maximize performance, all pages fetched from the FS should
+    go through the do_mpage_readpage routine or the FS should add
+    hooks to do the equivalent (cf. btrfs)
+  - Currently, the FS blocksize must be the same as PAGESIZE. This
+    is not an architectural restriction, but no backends currently
+    support anything different.
+  - A clustered FS should invoke the "shared_init_fs" cleancache
+    hook to get best performance for some backends.
+
+* Why not use the KVA of the inode as the key? (Christoph Hellwig)
+
+If cleancache would use the inode virtual address instead of
+inode/filehandle, the pool id could be eliminated. But, this
+won't work because cleancache retains pagecache data pages
+persistently even when the inode has been pruned from the
+inode unused list, and only invalidates the data page if the file
+gets removed/truncated. So if cleancache used the inode kva,
+there would be potential coherency issues if/when the inode
+kva is reused for a different file. Alternately, if cleancache
+invalidated the pages when the inode kva was freed, much of the value
+of cleancache would be lost because the cache of pages in cleancache
+is potentially much larger than the kernel pagecache and is most
+useful if the pages survive inode cache removal.
+
+* Why is a global variable required?
+
+The cleancache_enabled flag is checked in all of the frequently-used
+cleancache hooks. The alternative is a function call to check a static
+variable. Since cleancache is enabled dynamically at runtime, systems
+that don't enable cleancache would suffer thousands (possibly
+tens-of-thousands) of unnecessary function calls per second. So the
+global variable allows cleancache to be enabled by default at compile
+time, but have insignificant performance impact when cleancache remains
+disabled at runtime.
+
+* Does cleancache work with KVM?
+
+The memory model of KVM is sufficiently different that a cleancache
+backend may have less value for KVM. This remains to be tested,
+especially in an overcommitted system.
+
+* Does cleancache work in userspace? It sounds useful for
+  memory hungry caches like web browsers.
(Jamie Lokier) + +No plans yet, though we agree it sounds useful, at least for +apps that bypass the page cache (e.g. O_DIRECT). + +Last updated: Dan Magenheimer, April 13 2011 diff --git a/Documentation/vm/cleancache.txt b/Documentation/vm/cleancache.txt deleted file mode 100644 index 68cba9131c31..000000000000 --- a/Documentation/vm/cleancache.txt +++ /dev/null @@ -1,296 +0,0 @@ -.. _cleancache: - -========== -Cleancache -========== - -Motivation -========== - -Cleancache is a new optional feature provided by the VFS layer that -potentially dramatically increases page cache effectiveness for -many workloads in many environments at a negligible cost. - -Cleancache can be thought of as a page-granularity victim cache for clean -pages that the kernel's pageframe replacement algorithm (PFRA) would like -to keep around, but can't since there isn't enough memory. So when the -PFRA "evicts" a page, it first attempts to use cleancache code to -put the data contained in that page into "transcendent memory", memory -that is not directly accessible or addressable by the kernel and is -of unknown and possibly time-varying size. - -Later, when a cleancache-enabled filesystem wishes to access a page -in a file on disk, it first checks cleancache to see if it already -contains it; if it does, the page of data is copied into the kernel -and a disk access is avoided. - -Transcendent memory "drivers" for cleancache are currently implemented -in Xen (using hypervisor memory) and zcache (using in-kernel compressed -memory) and other implementations are in development. - -:ref:`FAQs ` are included below. - -Implementation Overview -======================= - -A cleancache "backend" that provides transcendent memory registers itself -to the kernel's cleancache "frontend" by calling cleancache_register_ops, -passing a pointer to a cleancache_ops structure with funcs set appropriately. -The functions provided must conform to certain semantics as follows: - -Most important, cleancache is "ephemeral". Pages which are copied into -cleancache have an indefinite lifetime which is completely unknowable -by the kernel and so may or may not still be in cleancache at any later time. -Thus, as its name implies, cleancache is not suitable for dirty pages. -Cleancache has complete discretion over what pages to preserve and what -pages to discard and when. - -Mounting a cleancache-enabled filesystem should call "init_fs" to obtain a -pool id which, if positive, must be saved in the filesystem's superblock; -a negative return value indicates failure. A "put_page" will copy a -(presumably about-to-be-evicted) page into cleancache and associate it with -the pool id, a file key, and a page index into the file. (The combination -of a pool id, a file key, and an index is sometimes called a "handle".) -A "get_page" will copy the page, if found, from cleancache into kernel memory. -An "invalidate_page" will ensure the page no longer is present in cleancache; -an "invalidate_inode" will invalidate all pages associated with the specified -file; and, when a filesystem is unmounted, an "invalidate_fs" will invalidate -all pages in all files specified by the given pool id and also surrender -the pool id. - -An "init_shared_fs", like init_fs, obtains a pool id but tells cleancache -to treat the pool as shared using a 128-bit UUID as a key. 
On systems -that may run multiple kernels (such as hard partitioned or virtualized -systems) that may share a clustered filesystem, and where cleancache -may be shared among those kernels, calls to init_shared_fs that specify the -same UUID will receive the same pool id, thus allowing the pages to -be shared. Note that any security requirements must be imposed outside -of the kernel (e.g. by "tools" that control cleancache). Or a -cleancache implementation can simply disable shared_init by always -returning a negative value. - -If a get_page is successful on a non-shared pool, the page is invalidated -(thus making cleancache an "exclusive" cache). On a shared pool, the page -is NOT invalidated on a successful get_page so that it remains accessible to -other sharers. The kernel is responsible for ensuring coherency between -cleancache (shared or not), the page cache, and the filesystem, using -cleancache invalidate operations as required. - -Note that cleancache must enforce put-put-get coherency and get-get -coherency. For the former, if two puts are made to the same handle but -with different data, say AAA by the first put and BBB by the second, a -subsequent get can never return the stale data (AAA). For get-get coherency, -if a get for a given handle fails, subsequent gets for that handle will -never succeed unless preceded by a successful put with that handle. - -Last, cleancache provides no SMP serialization guarantees; if two -different Linux threads are simultaneously putting and invalidating a page -with the same handle, the results are indeterminate. Callers must -lock the page to ensure serial behavior. - -Cleancache Performance Metrics -============================== - -If properly configured, monitoring of cleancache is done via debugfs in -the `/sys/kernel/debug/cleancache` directory. The effectiveness of cleancache -can be measured (across all filesystems) with: - -``succ_gets`` - number of gets that were successful - -``failed_gets`` - number of gets that failed - -``puts`` - number of puts attempted (all "succeed") - -``invalidates`` - number of invalidates attempted - -A backend implementation may provide additional metrics. - -.. _faq: - -FAQ -=== - -* Where's the value? (Andrew Morton) - -Cleancache provides a significant performance benefit to many workloads -in many environments with negligible overhead by improving the -effectiveness of the pagecache. Clean pagecache pages are -saved in transcendent memory (RAM that is otherwise not directly -addressable to the kernel); fetching those pages later avoids "refaults" -and thus disk reads. - -Cleancache (and its sister code "frontswap") provide interfaces for -this transcendent memory (aka "tmem"), which conceptually lies between -fast kernel-directly-addressable RAM and slower DMA/asynchronous devices. -Disallowing direct kernel or userland reads/writes to tmem -is ideal when data is transformed to a different form and size (such -as with compression) or secretly moved (as might be useful for write- -balancing for some RAM-like devices). Evicted page-cache pages (and -swap pages) are a great use for this kind of slower-than-RAM-but-much- -faster-than-disk transcendent memory, and the cleancache (and frontswap) -"page-object-oriented" specification provides a nice way to read and -write -- and indirectly "name" -- the pages. - -In the virtual case, the whole point of virtualization is to statistically -multiplex physical resources across the varying demands of multiple -virtual machines. 
This is really hard to do with RAM and efforts to -do it well with no kernel change have essentially failed (except in some -well-publicized special-case workloads). Cleancache -- and frontswap -- -with a fairly small impact on the kernel, provide a huge amount -of flexibility for more dynamic, flexible RAM multiplexing. -Specifically, the Xen Transcendent Memory backend allows otherwise -"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple -virtual machines, but the pages can be compressed and deduplicated to -optimize RAM utilization. And when guest OS's are induced to surrender -underutilized RAM (e.g. with "self-ballooning"), page cache pages -are the first to go, and cleancache allows those pages to be -saved and reclaimed if overall host system memory conditions allow. - -And the identical interface used for cleancache can be used in -physical systems as well. The zcache driver acts as a memory-hungry -device that stores pages of data in a compressed state. And -the proposed "RAMster" driver shares RAM across multiple physical -systems. - -* Why does cleancache have its sticky fingers so deep inside the - filesystems and VFS? (Andrew Morton and Christoph Hellwig) - -The core hooks for cleancache in VFS are in most cases a single line -and the minimum set are placed precisely where needed to maintain -coherency (via cleancache_invalidate operations) between cleancache, -the page cache, and disk. All hooks compile into nothingness if -cleancache is config'ed off and turn into a function-pointer- -compare-to-NULL if config'ed on but no backend claims the ops -functions, or to a compare-struct-element-to-negative if a -backend claims the ops functions but a filesystem doesn't enable -cleancache. - -Some filesystems are built entirely on top of VFS and the hooks -in VFS are sufficient, so don't require an "init_fs" hook; the -initial implementation of cleancache didn't provide this hook. -But for some filesystems (such as btrfs), the VFS hooks are -incomplete and one or more hooks in fs-specific code are required. -And for some other filesystems, such as tmpfs, cleancache may -be counterproductive. So it seemed prudent to require a filesystem -to "opt in" to use cleancache, which requires adding a hook in -each filesystem. Not all filesystems are supported by cleancache -only because they haven't been tested. The existing set should -be sufficient to validate the concept, the opt-in approach means -that untested filesystems are not affected, and the hooks in the -existing filesystems should make it very easy to add more -filesystems in the future. - -The total impact of the hooks to existing fs and mm files is only -about 40 lines added (not counting comments and blank lines). - -* Why not make cleancache asynchronous and batched so it can more - easily interface with real devices with DMA instead of copying each - individual page? (Minchan Kim) - -The one-page-at-a-time copy semantics simplifies the implementation -on both the frontend and backend and also allows the backend to -do fancy things on-the-fly like page compression and -page deduplication. And since the data is "gone" (copied into/out -of the pageframe) before the cleancache get/put call returns, -a great deal of race conditions and potential coherency issues -are avoided. While the interface seems odd for a "real device" -or for real kernel-addressable RAM, it makes perfect sense for -transcendent memory. - -* Why is non-shared cleancache "exclusive"? 
And where is the - page "invalidated" after a "get"? (Minchan Kim) - -The main reason is to free up space in transcendent memory and -to avoid unnecessary cleancache_invalidate calls. If you want inclusive, -the page can be "put" immediately following the "get". If -put-after-get for inclusive becomes common, the interface could -be easily extended to add a "get_no_invalidate" call. - -The invalidate is done by the cleancache backend implementation. - -* What's the performance impact? - -Performance analysis has been presented at OLS'09 and LCA'10. -Briefly, performance gains can be significant on most workloads, -especially when memory pressure is high (e.g. when RAM is -overcommitted in a virtual workload); and because the hooks are -invoked primarily in place of or in addition to a disk read/write, -overhead is negligible even in worst case workloads. Basically -cleancache replaces I/O with memory-copy-CPU-overhead; on older -single-core systems with slow memory-copy speeds, cleancache -has little value, but in newer multicore machines, especially -consolidated/virtualized machines, it has great value. - -* How do I add cleancache support for filesystem X? (Boaz Harrash) - -Filesystems that are well-behaved and conform to certain -restrictions can utilize cleancache simply by making a call to -cleancache_init_fs at mount time. Unusual, misbehaving, or -poorly layered filesystems must either add additional hooks -and/or undergo extensive additional testing... or should just -not enable the optional cleancache. - -Some points for a filesystem to consider: - - - The FS should be block-device-based (e.g. a ram-based FS such - as tmpfs should not enable cleancache) - - To ensure coherency/correctness, the FS must ensure that all - file removal or truncation operations either go through VFS or - add hooks to do the equivalent cleancache "invalidate" operations - - To ensure coherency/correctness, either inode numbers must - be unique across the lifetime of the on-disk file OR the - FS must provide an "encode_fh" function. - - The FS must call the VFS superblock alloc and deactivate routines - or add hooks to do the equivalent cleancache calls done there. - - To maximize performance, all pages fetched from the FS should - go through the do_mpag_readpage routine or the FS should add - hooks to do the equivalent (cf. btrfs) - - Currently, the FS blocksize must be the same as PAGESIZE. This - is not an architectural restriction, but no backends currently - support anything different. - - A clustered FS should invoke the "shared_init_fs" cleancache - hook to get best performance for some backends. - -* Why not use the KVA of the inode as the key? (Christoph Hellwig) - -If cleancache would use the inode virtual address instead of -inode/filehandle, the pool id could be eliminated. But, this -won't work because cleancache retains pagecache data pages -persistently even when the inode has been pruned from the -inode unused list, and only invalidates the data page if the file -gets removed/truncated. So if cleancache used the inode kva, -there would be potential coherency issues if/when the inode -kva is reused for a different file. Alternately, if cleancache -invalidated the pages when the inode kva was freed, much of the value -of cleancache would be lost because the cache of pages in cleanache -is potentially much larger than the kernel pagecache and is most -useful if the pages survive inode cache removal. - -* Why is a global variable required? 
- -The cleancache_enabled flag is checked in all of the frequently-used -cleancache hooks. The alternative is a function call to check a static -variable. Since cleancache is enabled dynamically at runtime, systems -that don't enable cleancache would suffer thousands (possibly -tens-of-thousands) of unnecessary function calls per second. So the -global variable allows cleancache to be enabled by default at compile -time, but have insignificant performance impact when cleancache remains -disabled at runtime. - -* Does cleanache work with KVM? - -The memory model of KVM is sufficiently different that a cleancache -backend may have less value for KVM. This remains to be tested, -especially in an overcommitted system. - -* Does cleancache work in userspace? It sounds useful for - memory hungry caches like web browsers. (Jamie Lokier) - -No plans yet, though we agree it sounds useful, at least for -apps that bypass the page cache (e.g. O_DIRECT). - -Last updated: Dan Magenheimer, April 13 2011 diff --git a/Documentation/vm/frontswap.rst b/Documentation/vm/frontswap.rst new file mode 100644 index 000000000000..1979f430c1c5 --- /dev/null +++ b/Documentation/vm/frontswap.rst @@ -0,0 +1,293 @@ +.. _frontswap: + +========= +Frontswap +========= + +Frontswap provides a "transcendent memory" interface for swap pages. +In some environments, dramatic performance savings may be obtained because +swapped pages are saved in RAM (or a RAM-like device) instead of a swap disk. + +(Note, frontswap -- and :ref:`cleancache` (merged at 3.0) -- are the "frontends" +and the only necessary changes to the core kernel for transcendent memory; +all other supporting code -- the "backends" -- is implemented as drivers. +See the LWN.net article `Transcendent memory in a nutshell`_ +for a detailed overview of frontswap and related kernel parts) + +.. _Transcendent memory in a nutshell: https://lwn.net/Articles/454795/ + +Frontswap is so named because it can be thought of as the opposite of +a "backing" store for a swap device. The storage is assumed to be +a synchronous concurrency-safe page-oriented "pseudo-RAM device" conforming +to the requirements of transcendent memory (such as Xen's "tmem", or +in-kernel compressed memory, aka "zcache", or future RAM-like devices); +this pseudo-RAM device is not directly accessible or addressable by the +kernel and is of unknown and possibly time-varying size. The driver +links itself to frontswap by calling frontswap_register_ops to set the +frontswap_ops funcs appropriately and the functions it provides must +conform to certain policies as follows: + +An "init" prepares the device to receive frontswap pages associated +with the specified swap device number (aka "type"). A "store" will +copy the page to transcendent memory and associate it with the type and +offset associated with the page. A "load" will copy the page, if found, +from transcendent memory into kernel memory, but will NOT remove the page +from transcendent memory. An "invalidate_page" will remove the page +from transcendent memory and an "invalidate_area" will remove ALL pages +associated with the swap type (e.g., like swapoff) and notify the "device" +to refuse further stores with that swap type. + +Once a page is successfully stored, a matching load on the page will normally +succeed. So when the kernel finds itself in a situation where it needs +to swap out a page, it first attempts to use frontswap. 
If the store returns
+success, the data has been successfully saved to transcendent memory and
+a disk write and, if the data is later read back, a disk read are avoided.
+If a store returns failure, transcendent memory has rejected the data, and the
+page can be written to swap as usual.
+
+If a backend chooses, frontswap can be configured as a "writethrough
+cache" by calling frontswap_writethrough(). In this mode, the reduction
+in swap device writes is lost (and also a non-trivial performance advantage)
+in order to allow the backend to arbitrarily "reclaim" space used to
+store frontswap pages to more completely manage its memory usage.
+
+Note that if a page is stored and the page already exists in transcendent
+memory (a "duplicate" store), either the store succeeds and the data is
+overwritten, or the store fails AND the page is invalidated. This ensures
+stale data may never be obtained from frontswap.
+
+If properly configured, monitoring of frontswap is done via debugfs in
+the ``/sys/kernel/debug/frontswap`` directory. The effectiveness of
+frontswap can be measured (across all swap devices) with:
+
+``failed_stores``
+  how many store attempts have failed
+
+``loads``
+  how many loads were attempted (all should succeed)
+
+``succ_stores``
+  how many store attempts have succeeded
+
+``invalidates``
+  how many invalidates were attempted
+
+A backend implementation may provide additional metrics.
+
+FAQ
+===
+
+* Where's the value?
+
+When a workload starts swapping, performance falls through the floor.
+Frontswap significantly increases performance in many such workloads by
+providing a clean, dynamic interface to read and write swap pages to
+"transcendent memory" that is otherwise not directly addressable by the kernel.
+This interface is ideal when data is transformed to a different form
+and size (such as with compression) or secretly moved (as might be
+useful for write-balancing for some RAM-like devices). Swap pages (and
+evicted page-cache pages) are a great use for this kind of slower-than-RAM-
+but-much-faster-than-disk "pseudo-RAM device" and the frontswap (and
+cleancache) interface to transcendent memory provides a nice way to read
+and write -- and indirectly "name" -- the pages.
+
+With a fairly small impact on the kernel, frontswap -- and cleancache --
+provide a huge amount of flexibility for more dynamic, flexible RAM
+utilization in various system configurations:
+
+In the single kernel case, aka "zcache", pages are compressed and
+stored in local memory, thus increasing the total number of anonymous pages
+that can be safely kept in RAM. Zcache essentially trades off CPU
+cycles used in compression/decompression for better memory utilization.
+Benchmarks have shown little or no impact when memory pressure is
+low while providing a significant performance improvement (25%+)
+on some workloads under high memory pressure.
+
+"RAMster" builds on zcache by adding "peer-to-peer" transcendent memory
+support for clustered systems. Frontswap pages are locally compressed
+as in zcache, but then "remotified" to another system's RAM. This
+allows RAM to be dynamically load-balanced back-and-forth as needed,
+i.e. when system A is overcommitted, it can swap to system B, and
+vice versa. RAMster can also be configured as a memory server so
+many servers in a cluster can swap, dynamically as needed, to a single
+server configured with a large amount of RAM... without pre-configuring
+how much of the RAM is available for each of the clients!
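+
+For concreteness, the init/store/load/invalidate policies described earlier
+map onto an ops structure that a backend fills in and registers. The sketch
+below is modeled on include/linux/frontswap.h of this era; exact signatures
+(and the return convention of frontswap_register_ops()) have varied across
+kernel versions, so treat it as illustrative rather than authoritative::
+
+  /* Illustrative shape of a backend's ops (cf. include/linux/frontswap.h;
+   * details vary by kernel version).  "type" is the swap device number.
+   */
+  struct frontswap_ops {
+          void (*init)(unsigned type);            /* swapon: prepare for type */
+          int (*store)(unsigned type, pgoff_t offset,
+                       struct page *page);        /* 0 = accepted, else reject */
+          int (*load)(unsigned type, pgoff_t offset,
+                      struct page *page);         /* copy back; page stays put */
+          void (*invalidate_page)(unsigned type, pgoff_t offset);
+          void (*invalidate_area)(unsigned type); /* swapoff: drop all pages */
+  };
+
+  /* A backend registers its ops once during initialization, e.g.:
+   * frontswap_register_ops(&my_backend_ops);
+   */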
+
+In the virtual case, the whole point of virtualization is to statistically
+multiplex physical resources across the varying demands of multiple
+virtual machines. This is really hard to do with RAM and efforts to do
+it well with no kernel changes have essentially failed (except in some
+well-publicized special-case workloads).
+Specifically, the Xen Transcendent Memory backend allows otherwise
+"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple
+virtual machines, but the pages can be compressed and deduplicated to
+optimize RAM utilization. And when guest OSes are induced to surrender
+underutilized RAM (e.g. with "selfballooning"), sudden unexpected
+memory pressure may result in swapping; frontswap allows those pages
+to be swapped to and from hypervisor RAM (if overall host system memory
+conditions allow), thus mitigating the potentially awful performance impact
+of unplanned swapping.
+
+A KVM implementation is underway and has been RFC'ed to lkml. And,
+using frontswap, investigation is also underway on the use of NVM as
+a memory extension technology.
+
+* Sure there may be performance advantages in some situations, but
+  what's the space/time overhead of frontswap?
+
+If CONFIG_FRONTSWAP is disabled, every frontswap hook compiles into
+nothingness and the only overhead is a few extra bytes per swapon'd
+swap device. If CONFIG_FRONTSWAP is enabled but no frontswap "backend"
+registers, there is one extra comparison of a global variable against zero
+for every swap page read or written. If CONFIG_FRONTSWAP is enabled
+AND a frontswap backend registers AND the backend fails every "store"
+request (i.e. provides no memory despite claiming it might),
+CPU overhead is still negligible -- and since every frontswap fail
+precedes a swap page write-to-disk, the system is highly likely
+to be I/O bound and using a small fraction of a percent of a CPU
+will be irrelevant anyway.
+
+As for space, if CONFIG_FRONTSWAP is enabled AND a frontswap backend
+registers, one bit is allocated for every swap page for every swap
+device that is swapon'd. This is added to the EIGHT bits (which
+were sixteen until about 2.6.34) that the kernel already allocates
+for every swap page for every swap device that is swapon'd. (Hugh
+Dickins has observed that frontswap could probably steal one of
+the existing eight bits, but let's worry about that minor optimization
+later.) For very large swap disks (which are rare) on a standard
+4K pagesize, this is 1MB per 32GB swap.
+
+When swap pages are stored in transcendent memory instead of written
+out to disk, there is a side effect that this may create more memory
+pressure that can potentially outweigh the other advantages. A
+backend, such as zcache, must implement policies to carefully (but
+dynamically) manage memory limits to ensure this doesn't happen.
+
+* OK, how about a quick overview of what this frontswap patch does
+  in terms that a kernel hacker can grok?
+
+Let's assume that a frontswap "backend" has registered during
+kernel initialization; this registration indicates that this
+frontswap backend has access to some "memory" that is not directly
+accessible by the kernel. Exactly how much memory it provides is
+entirely dynamic and random.
+
+Whenever a swap-device is swapon'd, frontswap_init() is called,
+passing the swap device number (aka "type") as a parameter.
+This notifies frontswap to expect attempts to "store" swap pages
+associated with that number.
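+
+The following paragraphs walk through the store and load hooks in detail.
+As a rough sketch of where the store hook sits, the swap write path consults
+frontswap before falling back to block I/O. This is a simplified paraphrase
+of mm/page_io.c from this era, not verbatim kernel code; error handling and
+helper names (e.g. __swap_writepage()) vary by kernel version::
+
+  int swap_writepage(struct page *page, struct writeback_control *wbc)
+  {
+          /* Offer the page to the frontswap backend first. */
+          if (frontswap_store(page) == 0) {
+                  /* Backend accepted it: no disk write is needed. */
+                  set_page_writeback(page);
+                  unlock_page(page);
+                  end_page_writeback(page);
+                  return 0;
+          }
+          /* Backend rejected it: write to the real swap device. */
+          return __swap_writepage(page, wbc, end_swap_bio_write);
+  }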
+
+Whenever the swap subsystem is readying a page to write to a swap
+device (cf. swap_writepage()), frontswap_store is called. Frontswap
+consults with the frontswap backend and if the backend says it does NOT
+have room, frontswap_store returns -1 and the kernel swaps the page
+to the swap device as normal. Note that the response from the frontswap
+backend is unpredictable to the kernel; it may choose to never accept a
+page, it could accept every ninth page, or it might accept every
+page. But if the backend does accept a page, the data from the page
+has already been copied and associated with the type and offset,
+and the backend guarantees the persistence of the data. In this case,
+frontswap sets a bit in the "frontswap_map" for the swap device
+corresponding to the page offset on the swap device to which it would
+otherwise have written the data.
+
+When the swap subsystem needs to swap-in a page (swap_readpage()),
+it first calls frontswap_load() which checks the frontswap_map to
+see if the page was earlier accepted by the frontswap backend. If
+it was, the page of data is filled from the frontswap backend and
+the swap-in is complete. If not, the normal swap-in code is
+executed to obtain the page of data from the real swap device.
+
+So every time the frontswap backend accepts a page, a swap device read
+and (potentially) a swap device write are replaced by a "frontswap backend
+store" and (possibly) a "frontswap backend load", which are presumably much
+faster.
+
+* Can't frontswap be configured as a "special" swap device that is
+  just higher priority than any real swap device (e.g. like zswap,
+  or maybe swap-over-nbd/NFS)?
+
+No. First, the existing swap subsystem doesn't allow for any kind of
+swap hierarchy. Perhaps it could be rewritten to accommodate a hierarchy,
+but this would require fairly drastic changes. Even if it were
+rewritten, the existing swap subsystem uses the block I/O layer which
+assumes a swap device is fixed size and any page in it is linearly
+addressable. Frontswap barely touches the existing swap subsystem,
+and works around the constraints of the block I/O subsystem to provide
+a great deal of flexibility and dynamicity.
+
+For example, the acceptance of any swap page by the frontswap backend is
+entirely unpredictable. This is critical to the definition of frontswap
+backends because it grants completely dynamic discretion to the
+backend. In zcache, one cannot know a priori how compressible a page is.
+"Poorly" compressible pages can be rejected, and "poorly" can itself be
+defined dynamically depending on current memory constraints.
+
+Further, frontswap is entirely synchronous whereas a real swap
+device is, by definition, asynchronous and uses block I/O. The
+block I/O layer is not only unnecessary, but may perform "optimizations"
+that are inappropriate for a RAM-oriented device including delaying
+the write of some pages for a significant amount of time. Synchrony is
+required to ensure the dynamicity of the backend and to avoid thorny race
+conditions that would unnecessarily and greatly complicate frontswap
+and/or the block I/O subsystem. That said, only the initial "store"
+and "load" operations need be synchronous. A separate asynchronous thread
+is free to manipulate the pages stored by frontswap. For example,
+the "remotification" thread in RAMster uses standard asynchronous
+kernel sockets to move compressed frontswap pages to a remote machine.
+Similarly, a KVM guest-side implementation could do in-guest compression
+and use "batched" hypercalls.
+
+In a virtualized environment, the dynamicity allows the hypervisor
+(or host OS) to do "intelligent overcommit". For example, it can
+choose to accept pages only until host-swapping might be imminent,
+then force guests to do their own swapping.
+
+There is a downside to the transcendent memory specifications for
+frontswap: Since any "store" might fail, there must always be a real
+slot on a real swap device to swap the page. Thus frontswap must be
+implemented as a "shadow" to every swapon'd device with the potential
+capability of holding every page that the swap device might have held
+and the possibility that it might hold no pages at all. This means
+that frontswap cannot contain more pages than the total of swapon'd
+swap devices. For example, if NO swap device is configured on some
+installation, frontswap is useless. Swapless portable devices
+can still use frontswap but a backend for such devices must configure
+some kind of "ghost" swap device and ensure that it is never used.
+
+* Why this weird definition about "duplicate stores"? If a page
+  has been previously successfully stored, can't it always be
+  successfully overwritten?
+
+Nearly always it can, but no, sometimes it cannot. Consider an example
+where data is compressed and the original 4K page has been compressed
+to 1K. Now an attempt is made to overwrite the page with data that
+is non-compressible and so would take the entire 4K. But the backend
+has no more space. In this case, the store must be rejected. Whenever
+frontswap rejects a store that would overwrite, it also must invalidate
+the old data and ensure that it is no longer accessible. Since the
+swap subsystem then writes the new data to the real swap device,
+this is the correct course of action to ensure coherency.
+
+* What is frontswap_shrink for?
+
+When the (non-frontswap) swap subsystem swaps out a page to a real
+swap device, that page is only taking up low-value pre-allocated disk
+space. But if frontswap has placed a page in transcendent memory, that
+page may be taking up valuable real estate. The frontswap_shrink
+routine allows code outside of the swap subsystem to force pages out
+of the memory managed by frontswap and back into kernel-addressable memory.
+For example, in RAMster, a "suction driver" thread will attempt
+to "repatriate" pages sent to a remote machine back to the local machine;
+this is driven using the frontswap_shrink mechanism when memory pressure
+subsides.
+
+* Why does the frontswap patch create the new include file swapfile.h?
+
+The frontswap code depends on some swap-subsystem-internal data
+structures that have, over the years, moved back and forth between
+static and global. This seemed a reasonable compromise: Define
+them as global but declare them in a new include file that isn't
+included by the large number of source files that include swap.h.
+
+Dan Magenheimer, last updated April 9, 2012
diff --git a/Documentation/vm/frontswap.txt b/Documentation/vm/frontswap.txt
deleted file mode 100644
index 1979f430c1c5..000000000000
--- a/Documentation/vm/frontswap.txt
+++ /dev/null
@@ -1,293 +0,0 @@
-.. _frontswap:
-
-=========
-Frontswap
-=========
-
-Frontswap provides a "transcendent memory" interface for swap pages.
-In some environments, dramatic performance savings may be obtained because
-swapped pages are saved in RAM (or a RAM-like device) instead of a swap disk.
- -(Note, frontswap -- and :ref:`cleancache` (merged at 3.0) -- are the "frontends" -and the only necessary changes to the core kernel for transcendent memory; -all other supporting code -- the "backends" -- is implemented as drivers. -See the LWN.net article `Transcendent memory in a nutshell`_ -for a detailed overview of frontswap and related kernel parts) - -.. _Transcendent memory in a nutshell: https://lwn.net/Articles/454795/ - -Frontswap is so named because it can be thought of as the opposite of -a "backing" store for a swap device. The storage is assumed to be -a synchronous concurrency-safe page-oriented "pseudo-RAM device" conforming -to the requirements of transcendent memory (such as Xen's "tmem", or -in-kernel compressed memory, aka "zcache", or future RAM-like devices); -this pseudo-RAM device is not directly accessible or addressable by the -kernel and is of unknown and possibly time-varying size. The driver -links itself to frontswap by calling frontswap_register_ops to set the -frontswap_ops funcs appropriately and the functions it provides must -conform to certain policies as follows: - -An "init" prepares the device to receive frontswap pages associated -with the specified swap device number (aka "type"). A "store" will -copy the page to transcendent memory and associate it with the type and -offset associated with the page. A "load" will copy the page, if found, -from transcendent memory into kernel memory, but will NOT remove the page -from transcendent memory. An "invalidate_page" will remove the page -from transcendent memory and an "invalidate_area" will remove ALL pages -associated with the swap type (e.g., like swapoff) and notify the "device" -to refuse further stores with that swap type. - -Once a page is successfully stored, a matching load on the page will normally -succeed. So when the kernel finds itself in a situation where it needs -to swap out a page, it first attempts to use frontswap. If the store returns -success, the data has been successfully saved to transcendent memory and -a disk write and, if the data is later read back, a disk read are avoided. -If a store returns failure, transcendent memory has rejected the data, and the -page can be written to swap as usual. - -If a backend chooses, frontswap can be configured as a "writethrough -cache" by calling frontswap_writethrough(). In this mode, the reduction -in swap device writes is lost (and also a non-trivial performance advantage) -in order to allow the backend to arbitrarily "reclaim" space used to -store frontswap pages to more completely manage its memory usage. - -Note that if a page is stored and the page already exists in transcendent memory -(a "duplicate" store), either the store succeeds and the data is overwritten, -or the store fails AND the page is invalidated. This ensures stale data may -never be obtained from frontswap. - -If properly configured, monitoring of frontswap is done via debugfs in -the `/sys/kernel/debug/frontswap` directory. The effectiveness of -frontswap can be measured (across all swap devices) with: - -``failed_stores`` - how many store attempts have failed - -``loads`` - how many loads were attempted (all should succeed) - -``succ_stores`` - how many store attempts have succeeded - -``invalidates`` - how many invalidates were attempted - -A backend implementation may provide additional metrics. - -FAQ -=== - -* Where's the value? - -When a workload starts swapping, performance falls through the floor. 
-Frontswap significantly increases performance in many such workloads by -providing a clean, dynamic interface to read and write swap pages to -"transcendent memory" that is otherwise not directly addressable to the kernel. -This interface is ideal when data is transformed to a different form -and size (such as with compression) or secretly moved (as might be -useful for write-balancing for some RAM-like devices). Swap pages (and -evicted page-cache pages) are a great use for this kind of slower-than-RAM- -but-much-faster-than-disk "pseudo-RAM device" and the frontswap (and -cleancache) interface to transcendent memory provides a nice way to read -and write -- and indirectly "name" -- the pages. - -Frontswap -- and cleancache -- with a fairly small impact on the kernel, -provides a huge amount of flexibility for more dynamic, flexible RAM -utilization in various system configurations: - -In the single kernel case, aka "zcache", pages are compressed and -stored in local memory, thus increasing the total anonymous pages -that can be safely kept in RAM. Zcache essentially trades off CPU -cycles used in compression/decompression for better memory utilization. -Benchmarks have shown little or no impact when memory pressure is -low while providing a significant performance improvement (25%+) -on some workloads under high memory pressure. - -"RAMster" builds on zcache by adding "peer-to-peer" transcendent memory -support for clustered systems. Frontswap pages are locally compressed -as in zcache, but then "remotified" to another system's RAM. This -allows RAM to be dynamically load-balanced back-and-forth as needed, -i.e. when system A is overcommitted, it can swap to system B, and -vice versa. RAMster can also be configured as a memory server so -many servers in a cluster can swap, dynamically as needed, to a single -server configured with a large amount of RAM... without pre-configuring -how much of the RAM is available for each of the clients! - -In the virtual case, the whole point of virtualization is to statistically -multiplex physical resources across the varying demands of multiple -virtual machines. This is really hard to do with RAM and efforts to do -it well with no kernel changes have essentially failed (except in some -well-publicized special-case workloads). -Specifically, the Xen Transcendent Memory backend allows otherwise -"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple -virtual machines, but the pages can be compressed and deduplicated to -optimize RAM utilization. And when guest OS's are induced to surrender -underutilized RAM (e.g. with "selfballooning"), sudden unexpected -memory pressure may result in swapping; frontswap allows those pages -to be swapped to and from hypervisor RAM (if overall host system memory -conditions allow), thus mitigating the potentially awful performance impact -of unplanned swapping. - -A KVM implementation is underway and has been RFC'ed to lkml. And, -using frontswap, investigation is also underway on the use of NVM as -a memory extension technology. - -* Sure there may be performance advantages in some situations, but - what's the space/time overhead of frontswap? - -If CONFIG_FRONTSWAP is disabled, every frontswap hook compiles into -nothingness and the only overhead is a few extra bytes per swapon'ed -swap device. If CONFIG_FRONTSWAP is enabled but no frontswap "backend" -registers, there is one extra global variable compared to zero for -every swap page read or written. 
If CONFIG_FRONTSWAP is enabled -AND a frontswap backend registers AND the backend fails every "store" -request (i.e. provides no memory despite claiming it might), -CPU overhead is still negligible -- and since every frontswap fail -precedes a swap page write-to-disk, the system is highly likely -to be I/O bound and using a small fraction of a percent of a CPU -will be irrelevant anyway. - -As for space, if CONFIG_FRONTSWAP is enabled AND a frontswap backend -registers, one bit is allocated for every swap page for every swap -device that is swapon'd. This is added to the EIGHT bits (which -was sixteen until about 2.6.34) that the kernel already allocates -for every swap page for every swap device that is swapon'd. (Hugh -Dickins has observed that frontswap could probably steal one of -the existing eight bits, but let's worry about that minor optimization -later.) For very large swap disks (which are rare) on a standard -4K pagesize, this is 1MB per 32GB swap. - -When swap pages are stored in transcendent memory instead of written -out to disk, there is a side effect that this may create more memory -pressure that can potentially outweigh the other advantages. A -backend, such as zcache, must implement policies to carefully (but -dynamically) manage memory limits to ensure this doesn't happen. - -* OK, how about a quick overview of what this frontswap patch does - in terms that a kernel hacker can grok? - -Let's assume that a frontswap "backend" has registered during -kernel initialization; this registration indicates that this -frontswap backend has access to some "memory" that is not directly -accessible by the kernel. Exactly how much memory it provides is -entirely dynamic and random. - -Whenever a swap-device is swapon'd frontswap_init() is called, -passing the swap device number (aka "type") as a parameter. -This notifies frontswap to expect attempts to "store" swap pages -associated with that number. - -Whenever the swap subsystem is readying a page to write to a swap -device (c.f swap_writepage()), frontswap_store is called. Frontswap -consults with the frontswap backend and if the backend says it does NOT -have room, frontswap_store returns -1 and the kernel swaps the page -to the swap device as normal. Note that the response from the frontswap -backend is unpredictable to the kernel; it may choose to never accept a -page, it could accept every ninth page, or it might accept every -page. But if the backend does accept a page, the data from the page -has already been copied and associated with the type and offset, -and the backend guarantees the persistence of the data. In this case, -frontswap sets a bit in the "frontswap_map" for the swap device -corresponding to the page offset on the swap device to which it would -otherwise have written the data. - -When the swap subsystem needs to swap-in a page (swap_readpage()), -it first calls frontswap_load() which checks the frontswap_map to -see if the page was earlier accepted by the frontswap backend. If -it was, the page of data is filled from the frontswap backend and -the swap-in is complete. If not, the normal swap-in code is -executed to obtain the page of data from the real swap device. - -So every time the frontswap backend accepts a page, a swap device read -and (potentially) a swap device write are replaced by a "frontswap backend -store" and (possibly) a "frontswap backend loads", which are presumably much -faster. 
- -* Can't frontswap be configured as a "special" swap device that is - just higher priority than any real swap device (e.g. like zswap, - or maybe swap-over-nbd/NFS)? - -No. First, the existing swap subsystem doesn't allow for any kind of -swap hierarchy. Perhaps it could be rewritten to accommodate a hierarchy, -but this would require fairly drastic changes. Even if it were -rewritten, the existing swap subsystem uses the block I/O layer which -assumes a swap device is fixed size and any page in it is linearly -addressable. Frontswap barely touches the existing swap subsystem, -and works around the constraints of the block I/O subsystem to provide -a great deal of flexibility and dynamicity. - -For example, the acceptance of any swap page by the frontswap backend is -entirely unpredictable. This is critical to the definition of frontswap -backends because it grants completely dynamic discretion to the -backend. In zcache, one cannot know a priori how compressible a page is. -"Poorly" compressible pages can be rejected, and "poorly" can itself be -defined dynamically depending on current memory constraints. - -Further, frontswap is entirely synchronous whereas a real swap -device is, by definition, asynchronous and uses block I/O. The -block I/O layer is not only unnecessary, but may perform "optimizations" -that are inappropriate for a RAM-oriented device including delaying -the write of some pages for a significant amount of time. Synchrony is -required to ensure the dynamicity of the backend and to avoid thorny race -conditions that would unnecessarily and greatly complicate frontswap -and/or the block I/O subsystem. That said, only the initial "store" -and "load" operations need be synchronous. A separate asynchronous thread -is free to manipulate the pages stored by frontswap. For example, -the "remotification" thread in RAMster uses standard asynchronous -kernel sockets to move compressed frontswap pages to a remote machine. -Similarly, a KVM guest-side implementation could do in-guest compression -and use "batched" hypercalls. - -In a virtualized environment, the dynamicity allows the hypervisor -(or host OS) to do "intelligent overcommit". For example, it can -choose to accept pages only until host-swapping might be imminent, -then force guests to do their own swapping. - -There is a downside to the transcendent memory specifications for -frontswap: Since any "store" might fail, there must always be a real -slot on a real swap device to swap the page. Thus frontswap must be -implemented as a "shadow" to every swapon'd device with the potential -capability of holding every page that the swap device might have held -and the possibility that it might hold no pages at all. This means -that frontswap cannot contain more pages than the total of swapon'd -swap devices. For example, if NO swap device is configured on some -installation, frontswap is useless. Swapless portable devices -can still use frontswap but a backend for such devices must configure -some kind of "ghost" swap device and ensure that it is never used. - -* Why this weird definition about "duplicate stores"? If a page - has been previously successfully stored, can't it always be - successfully overwritten? - -Nearly always it can, but no, sometimes it cannot. Consider an example -where data is compressed and the original 4K page has been compressed -to 1K. Now an attempt is made to overwrite the page with data that -is non-compressible and so would take the entire 4K. But the backend -has no more space. 
In this case, the store must be rejected. Whenever -frontswap rejects a store that would overwrite, it also must invalidate -the old data and ensure that it is no longer accessible. Since the -swap subsystem then writes the new data to the read swap device, -this is the correct course of action to ensure coherency. - -* What is frontswap_shrink for? - -When the (non-frontswap) swap subsystem swaps out a page to a real -swap device, that page is only taking up low-value pre-allocated disk -space. But if frontswap has placed a page in transcendent memory, that -page may be taking up valuable real estate. The frontswap_shrink -routine allows code outside of the swap subsystem to force pages out -of the memory managed by frontswap and back into kernel-addressable memory. -For example, in RAMster, a "suction driver" thread will attempt -to "repatriate" pages sent to a remote machine back to the local machine; -this is driven using the frontswap_shrink mechanism when memory pressure -subsides. - -* Why does the frontswap patch create the new include file swapfile.h? - -The frontswap code depends on some swap-subsystem-internal data -structures that have, over the years, moved back and forth between -static and global. This seemed a reasonable compromise: Define -them as global but declare them in a new include file that isn't -included by the large number of source files that include swap.h. - -Dan Magenheimer, last updated April 9, 2012 diff --git a/Documentation/vm/highmem.rst b/Documentation/vm/highmem.rst new file mode 100644 index 000000000000..0f69a9fec34d --- /dev/null +++ b/Documentation/vm/highmem.rst @@ -0,0 +1,147 @@ +.. _highmem: + +==================== +High Memory Handling +==================== + +By: Peter Zijlstra + +.. contents:: :local: + +What Is High Memory? +==================== + +High memory (highmem) is used when the size of physical memory approaches or +exceeds the maximum size of virtual memory. At that point it becomes +impossible for the kernel to keep all of the available physical memory mapped +at all times. This means the kernel needs to start using temporary mappings of +the pieces of physical memory that it wants to access. + +The part of (physical) memory not covered by a permanent mapping is what we +refer to as 'highmem'. There are various architecture dependent constraints on +where exactly that border lies. + +In the i386 arch, for example, we choose to map the kernel into every process's +VM space so that we don't have to pay the full TLB invalidation costs for +kernel entry/exit. This means the available virtual memory space (4GiB on +i386) has to be divided between user and kernel space. + +The traditional split for architectures using this approach is 3:1, 3GiB for +userspace and the top 1GiB for kernel space:: + + +--------+ 0xffffffff + | Kernel | + +--------+ 0xc0000000 + | | + | User | + | | + +--------+ 0x00000000 + +This means that the kernel can at most map 1GiB of physical memory at any one +time, but because we need virtual address space for other things - including +temporary maps to access the rest of the physical memory - the actual direct +map will typically be less (usually around ~896MiB). + +Other architectures that have mm context tagged TLBs can have separate kernel +and user maps. Some hardware (like some ARMs), however, have limited virtual +space when they use mm context tags. + + +Temporary Virtual Mappings +========================== + +The kernel contains several ways of creating temporary mappings: + +* vmap(). 
This can be used to make a long duration mapping of multiple
+  physical pages into a contiguous virtual space. It needs global
+  synchronization to unmap.
+
+* kmap(). This permits a short duration mapping of a single page. It needs
+  global synchronization, but is amortized somewhat. It is also prone to
+  deadlocks when used in a nested fashion, and so it is not recommended for
+  new code.
+
+* kmap_atomic(). This permits a very short duration mapping of a single
+  page. Since the mapping is restricted to the CPU that issued it, it
+  performs well, but the issuing task is therefore required to stay on that
+  CPU until it has finished, lest some other task displace its mappings.
+
+  kmap_atomic() may also be used by interrupt contexts, since it does not
+  sleep and the caller may not sleep until after kunmap_atomic() is called.
+
+  It may be assumed that k[un]map_atomic() won't fail.
+
+
+Using kmap_atomic
+=================
+
+When and where to use kmap_atomic() is straightforward. It is used when code
+wants to access the contents of a page that might be allocated from high memory
+(see __GFP_HIGHMEM), for example a page in the pagecache. The API has two
+functions, and they can be used in a manner similar to the following::
+
+  /* Find the page of interest. */
+  struct page *page = find_get_page(mapping, offset);
+
+  /* Gain access to the contents of that page. */
+  void *vaddr = kmap_atomic(page);
+
+  /* Do something to the contents of that page. */
+  memset(vaddr, 0, PAGE_SIZE);
+
+  /* Unmap that page. */
+  kunmap_atomic(vaddr);
+
+Note that the kunmap_atomic() call takes the result of the kmap_atomic() call
+not the argument.
+
+If you need to map two pages because you want to copy from one page to
+another you need to keep the kmap_atomic calls strictly nested, like::
+
+  vaddr1 = kmap_atomic(page1);
+  vaddr2 = kmap_atomic(page2);
+
+  memcpy(vaddr1, vaddr2, PAGE_SIZE);
+
+  kunmap_atomic(vaddr2);
+  kunmap_atomic(vaddr1);
+
+
+Cost of Temporary Mappings
+==========================
+
+The cost of creating temporary mappings can be quite high. The arch has to
+manipulate the kernel's page tables, the data TLB and/or the MMU's registers.
+
+If CONFIG_HIGHMEM is not set, then the kernel will try to create a mapping
+simply with a bit of arithmetic that will convert the page struct address into
+a pointer to the page contents rather than juggling mappings about (see the
+sketch after the PAE notes below). In such a case, the unmap operation may be
+a null operation.
+
+If CONFIG_MMU is not set, then there can be no temporary mappings and no
+highmem. In such a case, the arithmetic approach will also be used.
+
+
+i386 PAE
+========
+
+The i386 arch, under some circumstances, will permit you to stick up to 64GiB
+of RAM into your 32-bit machine. This has a number of consequences:
+
+* Linux needs a page-frame structure for each page in the system and the
+  pageframes need to live in the permanent mapping, which means:
+
+* you can have 896M/sizeof(struct page) page-frames at most; with struct
+  page being 32 bytes that would end up being something on the order of 112G
+  worth of pages; the kernel, however, needs to store more than just
+  page-frames in that memory...
+
+* PAE makes your page tables larger - which slows the system down as more
+  data has to be accessed when traversing page tables in TLB fills and the
+  like. One advantage is that PAE has more PTE bits and can provide advanced
+  features like NX and PAT.
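+
+As a concrete illustration of the "bit of arithmetic" mentioned under Cost of
+Temporary Mappings above: with no highmem, every page is permanently mapped,
+so a kmap()-style call can reduce to address arithmetic. This sketch follows
+lowmem_page_address() in include/linux/mm.h; details vary by architecture and
+kernel version::
+
+  /* Sketch: directly compute the kernel virtual address of a
+   * permanently-mapped (lowmem) page -- no mapping juggling needed.
+   */
+  static inline void *lowmem_page_address(const struct page *page)
+  {
+          return __va(page_to_pfn(page) << PAGE_SHIFT);
+  }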
+ +The general recommendation is that you don't use more than 8GiB on a 32-bit +machine - although more might work for you and your workload, you're pretty +much on your own - don't expect kernel developers to really care much if things +come apart. diff --git a/Documentation/vm/highmem.txt b/Documentation/vm/highmem.txt deleted file mode 100644 index 0f69a9fec34d..000000000000 --- a/Documentation/vm/highmem.txt +++ /dev/null @@ -1,147 +0,0 @@ -.. _highmem: - -==================== -High Memory Handling -==================== - -By: Peter Zijlstra - -.. contents:: :local: - -What Is High Memory? -==================== - -High memory (highmem) is used when the size of physical memory approaches or -exceeds the maximum size of virtual memory. At that point it becomes -impossible for the kernel to keep all of the available physical memory mapped -at all times. This means the kernel needs to start using temporary mappings of -the pieces of physical memory that it wants to access. - -The part of (physical) memory not covered by a permanent mapping is what we -refer to as 'highmem'. There are various architecture dependent constraints on -where exactly that border lies. - -In the i386 arch, for example, we choose to map the kernel into every process's -VM space so that we don't have to pay the full TLB invalidation costs for -kernel entry/exit. This means the available virtual memory space (4GiB on -i386) has to be divided between user and kernel space. - -The traditional split for architectures using this approach is 3:1, 3GiB for -userspace and the top 1GiB for kernel space:: - - +--------+ 0xffffffff - | Kernel | - +--------+ 0xc0000000 - | | - | User | - | | - +--------+ 0x00000000 - -This means that the kernel can at most map 1GiB of physical memory at any one -time, but because we need virtual address space for other things - including -temporary maps to access the rest of the physical memory - the actual direct -map will typically be less (usually around ~896MiB). - -Other architectures that have mm context tagged TLBs can have separate kernel -and user maps. Some hardware (like some ARMs), however, have limited virtual -space when they use mm context tags. - - -Temporary Virtual Mappings -========================== - -The kernel contains several ways of creating temporary mappings: - -* vmap(). This can be used to make a long duration mapping of multiple - physical pages into a contiguous virtual space. It needs global - synchronization to unmap. - -* kmap(). This permits a short duration mapping of a single page. It needs - global synchronization, but is amortized somewhat. It is also prone to - deadlocks when using in a nested fashion, and so it is not recommended for - new code. - -* kmap_atomic(). This permits a very short duration mapping of a single - page. Since the mapping is restricted to the CPU that issued it, it - performs well, but the issuing task is therefore required to stay on that - CPU until it has finished, lest some other task displace its mappings. - - kmap_atomic() may also be used by interrupt contexts, since it is does not - sleep and the caller may not sleep until after kunmap_atomic() is called. - - It may be assumed that k[un]map_atomic() won't fail. - - -Using kmap_atomic -================= - -When and where to use kmap_atomic() is straightforward. It is used when code -wants to access the contents of a page that might be allocated from high memory -(see __GFP_HIGHMEM), for example a page in the pagecache. 
The API has two
-functions, and they can be used in a manner similar to the following::
-
-  /* Find the page of interest. */
-  struct page *page = find_get_page(mapping, offset);
-
-  /* Gain access to the contents of that page. */
-  void *vaddr = kmap_atomic(page);
-
-  /* Do something to the contents of that page. */
-  memset(vaddr, 0, PAGE_SIZE);
-
-  /* Unmap that page. */
-  kunmap_atomic(vaddr);
-
-Note that the kunmap_atomic() call takes the result of the kmap_atomic() call
-not the argument.
-
-If you need to map two pages because you want to copy from one page to
-another you need to keep the kmap_atomic calls strictly nested, like::
-
-  vaddr1 = kmap_atomic(page1);
-  vaddr2 = kmap_atomic(page2);
-
-  memcpy(vaddr1, vaddr2, PAGE_SIZE);
-
-  kunmap_atomic(vaddr2);
-  kunmap_atomic(vaddr1);
-
-
-Cost of Temporary Mappings
-==========================
-
-The cost of creating temporary mappings can be quite high. The arch has to
-manipulate the kernel's page tables, the data TLB and/or the MMU's registers.
-
-If CONFIG_HIGHMEM is not set, then the kernel will try and create a mapping
-simply with a bit of arithmetic that will convert the page struct address into
-a pointer to the page contents rather than juggling mappings about. In such a
-case, the unmap operation may be a null operation.
-
-If CONFIG_MMU is not set, then there can be no temporary mappings and no
-highmem. In such a case, the arithmetic approach will also be used.
-
-
-i386 PAE
-========
-
-The i386 arch, under some circumstances, will permit you to stick up to 64GiB
-of RAM into your 32-bit machine. This has a number of consequences:
-
-* Linux needs a page-frame structure for each page in the system and the
-  pageframes need to live in the permanent mapping, which means:
-
-* you can have 896M/sizeof(struct page) page-frames at most; with struct
-  page being 32-bytes that would end up being something in the order of 112G
-  worth of pages; the kernel, however, needs to store more than just
-  page-frames in that memory...
-
-* PAE makes your page tables larger - which slows the system down as more
-  data has to be accessed to traverse in TLB fills and the like. One
-  advantage is that PAE has more PTE bits and can provide advanced features
-  like NX and PAT.
-
-The general recommendation is that you don't use more than 8GiB on a 32-bit
-machine - although more might work for you and your workload, you're pretty
-much on your own - don't expect kernel developers to really care much if things
-come apart.
diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst
new file mode 100644
index 000000000000..3fafa3381730
--- /dev/null
+++ b/Documentation/vm/hmm.rst
@@ -0,0 +1,374 @@
+.. _hmm:
+
+=====================================
+Heterogeneous Memory Management (HMM)
+=====================================
+
+Transparently allow any component of a program to use any memory region of
+said program with a device, without using a device specific memory allocator.
+This is becoming a requirement to simplify the use of advanced heterogeneous
+computing where GPUs, DSPs or FPGAs are used to perform various computations.
+
+This document is divided as follows: in the first section I expose the
+problems related to the use of a device specific memory allocator. The second
+section covers the hardware limitations that are inherent to many platforms.
+The third section gives an overview of the HMM design. The fourth section
+explains how CPU page table mirroring works and what HMM's purpose is in
+this context.
+The fifth section deals with how device memory is represented inside the
+kernel. Finally, the last section presents the new migration helper that
+allows leveraging the device DMA engine.
+
+.. contents:: :local:
+
+Problems of using a device specific memory allocator
+====================================================
+
+Devices with a large amount of on-board memory (several gigabytes) like GPUs
+have historically managed their memory through dedicated driver specific
+APIs. This creates a disconnect between memory allocated and managed by the
+device driver and regular application memory (private anonymous, shared
+memory or regular file backed memory). From here on I will refer to this
+aspect as split address space. I use shared address space to refer to the
+opposite situation, i.e. one in which any memory region can be used by a
+device transparently.
+
+The address space is split because a device can only access memory allocated
+through the device specific API. This implies that all memory objects in a
+program are not equal from the device point of view, which complicates large
+programs that rely on a wide set of libraries.
+
+Concretely, this means that code that wants to leverage a device like a GPU
+needs to copy objects between generically allocated memory (malloc, private
+or shared mmap) and memory allocated through the device driver API (this
+still ends up with an mmap, but of the device file).
+
+For flat data sets (array, grid, image, ...) this isn't too hard to achieve
+but complex data sets (list, tree, ...) are hard to get right. Duplicating a
+complex data set needs to re-map all the pointer relations between each of
+its elements. This is error prone and programs get harder to debug because
+of the duplicated data set.
+
+Split address space also means that libraries cannot transparently use data
+they are getting from the core program or from another library and thus each
+library might have to duplicate its input data set using a specific memory
+allocator. Large projects suffer from this and waste resources because of
+the various memory copies.
+
+Duplicating each library API to accept as input or output memory allocated
+by each device specific allocator is not a viable option. It would lead to a
+combinatorial explosion in the library entry points.
+
+Finally, with the advance of high level language constructs (in C++ but in
+other languages too) it is now possible for the compiler to leverage GPUs
+and other devices without programmer knowledge. Some compiler identified
+patterns are only doable with a shared address space. It is also more
+reasonable to use a shared address space for all other patterns.
+
+
+System bus, device memory characteristics
+=========================================
+
+System buses cripple shared address spaces due to a few limitations. Most
+system buses only allow basic memory access from device to main memory; even
+cache coherency is often optional. Access to device memory from a CPU is
+even more limited. More often than not, it is not cache coherent.
+
+If we only consider the PCIE bus, then a device can access main memory
+(often through an IOMMU) and be cache coherent with the CPUs. However, it
+only allows a limited set of atomic operations from the device on main
+memory. This is worse in the other direction: the CPU can only access a
+limited range of the device memory and cannot perform atomic operations on
+it. Thus device memory cannot be considered the same as regular memory from
+the kernel point of view.
+
+Another crippling factor is the limited bandwidth (~32GBytes/s with PCIE 4.0
+and 16 lanes).
+This is 33 times less than the fastest GPU memory (1 TBytes/s). The final
+limitation is latency: access to main memory from the device has an order of
+magnitude higher latency than when the device accesses its own memory.
+
+Some platforms are developing new system buses or additions/modifications to
+PCIE to address some of those limitations (OpenCAPI, CCIX). They mainly
+allow two-way cache coherency between CPU and device and allow all the
+atomic operations the architecture supports. Sadly, not all platforms are
+following this trend and some major architectures are left without hardware
+solutions to those problems.
+
+So for the shared address space to make sense, not only must we allow the
+device to access any memory, but we must also permit any memory to be
+migrated to device memory while the device is using it (blocking CPU access
+while it happens).
+
+
+Shared address space and migration
+==================================
+
+HMM intends to provide two main features. The first one is to share the
+address space by duplicating the CPU page table in the device page table, so
+the same address points to the same memory, and this for any valid main
+memory address in the process address space.
+
+To achieve this, HMM offers a set of helpers to populate the device page
+table while keeping track of CPU page table updates. Device page table
+updates are not as easy as CPU page table updates. To update the device page
+table, you must allocate a buffer (or use a pool of pre-allocated buffers)
+and write GPU specific commands in it to perform the update (unmap, cache
+invalidations, flush, ...). This cannot be done through common code for all
+devices. Hence why HMM provides helpers to factor out everything that can be
+while leaving the gory details to the device driver.
+
+The second mechanism HMM provides is a new kind of ZONE_DEVICE memory that
+allows allocating a struct page for each page of the device memory. Those
+pages are special because the CPU cannot map them. However, they allow
+migrating main memory to device memory using the existing migration
+mechanisms, and from the CPU point of view everything looks like a page that
+was swapped out to disk. Using a struct page gives the easiest and cleanest
+integration with existing mm mechanisms. Here again, HMM only provides
+helpers: first to hotplug new ZONE_DEVICE memory for the device memory and
+second to perform migration. Policy decisions of what and when to migrate
+things are left to the device driver.
+
+Note that any CPU access to a device page triggers a page fault and a
+migration back to main memory. That is, when a page backing a given address
+A is migrated from a main memory page to a device page, then any CPU access
+to address A triggers a page fault and initiates a migration back to main
+memory.
+
+
+With these two features, HMM not only allows a device to mirror a process
+address space and keep both the CPU and device page tables synchronized, but
+also allows leveraging device memory by migrating the part of the data set
+that is actively being used by a device.
+
+
+Address space mirroring implementation and API
+==============================================
+
+Address space mirroring's main objective is to allow duplication of a range
+of CPU page table entries into a device page table; HMM helps keep both
+synchronized.
A +device driver that want to mirror a process address space must start with the +registration of an hmm_mirror struct:: + + int hmm_mirror_register(struct hmm_mirror *mirror, + struct mm_struct *mm); + int hmm_mirror_register_locked(struct hmm_mirror *mirror, + struct mm_struct *mm); + +The locked variant is to be use when the driver is already holding the mmap_sem +of the mm in write mode. The mirror struct has a set of callback that are use +to propagate CPU page table:: + + struct hmm_mirror_ops { + /* sync_cpu_device_pagetables() - synchronize page tables + * + * @mirror: pointer to struct hmm_mirror + * @update_type: type of update that occurred to the CPU page table + * @start: virtual start address of the range to update + * @end: virtual end address of the range to update + * + * This callback ultimately originates from mmu_notifiers when the CPU + * page table is updated. The device driver must update its page table + * in response to this callback. The update argument tells what action + * to perform. + * + * The device driver must not return from this callback until the device + * page tables are completely updated (TLBs flushed, etc); this is a + * synchronous call. + */ + void (*update)(struct hmm_mirror *mirror, + enum hmm_update action, + unsigned long start, + unsigned long end); + }; + +Device driver must perform update to the range following action (turn range +read only, or fully unmap, ...). Once driver callback returns the device must +be done with the update. + + +When device driver wants to populate a range of virtual address it can use +either:: + + int hmm_vma_get_pfns(struct vm_area_struct *vma, + struct hmm_range *range, + unsigned long start, + unsigned long end, + hmm_pfn_t *pfns); + int hmm_vma_fault(struct vm_area_struct *vma, + struct hmm_range *range, + unsigned long start, + unsigned long end, + hmm_pfn_t *pfns, + bool write, + bool block); + +First one (hmm_vma_get_pfns()) will only fetch present CPU page table entry and +will not trigger a page fault on missing or non present entry. The second one +do trigger page fault on missing or read only entry if write parameter is true. +Page fault use the generic mm page fault code path just like a CPU page fault. + +Both function copy CPU page table into their pfns array argument. Each entry in +that array correspond to an address in the virtual range. HMM provide a set of +flags to help driver identify special CPU page table entries. + +Locking with the update() callback is the most important aspect the driver must +respect in order to keep things properly synchronize. The usage pattern is:: + + int driver_populate_range(...) + { + struct hmm_range range; + ... + again: + ret = hmm_vma_get_pfns(vma, &range, start, end, pfns); + if (ret) + return ret; + take_lock(driver->update); + if (!hmm_vma_range_done(vma, &range)) { + release_lock(driver->update); + goto again; + } + + // Use pfns array content to update device page table + + release_lock(driver->update); + return 0; + } + +The driver->update lock is the same lock that driver takes inside its update() +callback. That lock must be call before hmm_vma_range_done() to avoid any race +with a concurrent CPU page table update. + +HMM implements all this on top of the mmu_notifier API because we wanted to a +simpler API and also to be able to perform optimization latter own like doing +concurrent device update in multi-devices scenario. 
+
+HMM also serves as an impedance mismatch between how CPU page table updates
+are done (by the CPU writing to the page table and flushing TLBs) and how
+devices update their own page table. A device update is a multi-step
+process: first, appropriate commands are written to a buffer, then this
+buffer is scheduled for execution on the device. It is only once the device
+has executed the commands in the buffer that the update is done. Creating
+and scheduling the update command buffers can happen concurrently for
+multiple devices. Waiting for each device to report commands as executed is
+serialized (there is no point in doing this concurrently).
+
+
+Represent and manage device memory from core kernel point of view
+==================================================================
+
+Several different designs were tried to support device memory. The first one
+used a device specific data structure to keep information about migrated
+memory and HMM hooked itself in various places of the mm code to handle any
+access to addresses that were backed by device memory. It turns out that
+this ended up replicating most of the fields of struct page and also needed
+many kernel code paths to be updated to understand this new kind of memory.
+
+The thing is, most kernel code paths never try to access the memory behind a
+page but only care about struct page contents. Because of this, HMM switched
+to directly using struct page for device memory, which left most kernel code
+paths unaware of the difference. We only need to make sure that no one ever
+tries to map those pages from the CPU side.
+
+HMM provides a set of helpers to register and hotplug device memory as a new
+region needing a struct page. This is offered through a very simple API::
+
+ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
+                                   struct device *device,
+                                   unsigned long size);
+ void hmm_devmem_remove(struct hmm_devmem *devmem);
+
+The hmm_devmem_ops is where most of the important things are::
+
+ struct hmm_devmem_ops {
+     void (*free)(struct hmm_devmem *devmem, struct page *page);
+     int (*fault)(struct hmm_devmem *devmem,
+                  struct vm_area_struct *vma,
+                  unsigned long addr,
+                  struct page *page,
+                  unsigned flags,
+                  pmd_t *pmdp);
+ };
+
+The first callback (free()) happens when the last reference on a device page
+is dropped. This means the device page is now free and no longer used by
+anyone. The second callback happens whenever the CPU tries to access a
+device page, which it cannot do. This second callback must trigger a
+migration back to system memory.
+
+
+Migrate to and from device memory
+=================================
+
+Because the CPU cannot access device memory, migration must use the device
+DMA engine to perform copies from and to device memory. For this we need a
+new migration helper::
+
+ int migrate_vma(const struct migrate_vma_ops *ops,
+                 struct vm_area_struct *vma,
+                 unsigned long mentries,
+                 unsigned long start,
+                 unsigned long end,
+                 unsigned long *src,
+                 unsigned long *dst,
+                 void *private);
+
+Unlike other migration functions, it works on a range of virtual addresses;
+there are two reasons for that. First, device DMA copy has a high setup
+overhead cost and thus batching multiple pages is needed, as otherwise the
+migration overhead makes the whole exercise pointless. The second reason is
+that drivers trigger such migrations based on a range of addresses the
+device is actively accessing.
+
+The migrate_vma_ops struct defines two callbacks. The first one
+(alloc_and_copy()) controls destination memory allocation and the copy
+operation.
+The second one (finalize_and_map()) allows the device driver to perform
+cleanup operations after the migration::
+
+ struct migrate_vma_ops {
+     void (*alloc_and_copy)(struct vm_area_struct *vma,
+                            const unsigned long *src,
+                            unsigned long *dst,
+                            unsigned long start,
+                            unsigned long end,
+                            void *private);
+     void (*finalize_and_map)(struct vm_area_struct *vma,
+                              const unsigned long *src,
+                              const unsigned long *dst,
+                              unsigned long start,
+                              unsigned long end,
+                              void *private);
+ };
+
+It is important to stress that these migration helpers allow for holes in
+the virtual address range. Some pages in the range might not be migrated for
+all the usual reasons (a page is pinned, a page is locked, ...). This helper
+does not fail but just skips over those pages.
+
+The alloc_and_copy() callback might also decide not to migrate all pages in
+the range (for reasons under the callback's control). For those, the
+callback just has to leave the corresponding dst entries empty.
+
+Finally, the migration of a struct page might fail (for file backed pages)
+for various reasons (failure to freeze the reference, or to update the page
+cache, ...). If that happens, then finalize_and_map() can catch any pages
+that were not migrated. Note that those pages were still copied to a new
+page and thus we wasted bandwidth, but this is considered a rare event and a
+price that we are willing to pay to keep all the code simpler.
+
+
+Memory cgroup (memcg) and rss accounting
+========================================
+
+For now device memory is accounted as any regular page in rss counters
+(either anonymous if the device page is used for anonymous memory, file if
+the device page is used for a file backed page or shmem if the device page
+is used for shared memory). This is a deliberate choice to keep existing
+applications, that might start using device memory without knowing about it,
+running unimpacted.
+
+A drawback is that the OOM killer might kill an application using a lot of
+device memory and not a lot of regular system memory, and thus not free much
+system memory. We want to gather more real world experience on how
+applications and the system react under memory pressure in the presence of
+device memory before deciding to account device memory differently.
+
+
+The same decision was made for memory cgroups. Device memory pages are
+accounted against the same memory cgroup a regular page would be accounted
+to. This does simplify migration to and from device memory. This also means
+that migration back from device memory to regular memory cannot fail because
+it would go above the memory cgroup limit. We might revisit this choice
+later on once we get more experience in how device memory is used and its
+impact on memory resource control.
+
+
+Note that device memory can never be pinned, neither by a device driver nor
+through GUP, and thus such memory is always freed upon process exit. Or,
+when the last reference is dropped, in the case of shared memory or file
+backed memory.
diff --git a/Documentation/vm/hmm.txt b/Documentation/vm/hmm.txt
deleted file mode 100644
index 3fafa3381730..000000000000
--- a/Documentation/vm/hmm.txt
+++ /dev/null
@@ -1,374 +0,0 @@
-.. hmm:
-
-=====================================
-Heterogeneous Memory Management (HMM)
-=====================================
-
-Transparently allow any component of a program to use any memory region of said
-program with a device without using device specific memory allocator. This is
-becoming a requirement to simplify the use of advance heterogeneous computing
-where GPU, DSP or FPGA are use to perform various computations.
- -This document is divided as follow, in the first section i expose the problems -related to the use of a device specific allocator. The second section i expose -the hardware limitations that are inherent to many platforms. The third section -gives an overview of HMM designs. The fourth section explains how CPU page- -table mirroring works and what is HMM purpose in this context. Fifth section -deals with how device memory is represented inside the kernel. Finaly the last -section present the new migration helper that allow to leverage the device DMA -engine. - -.. contents:: :local: - -Problems of using device specific memory allocator -================================================== - -Device with large amount of on board memory (several giga bytes) like GPU have -historically manage their memory through dedicated driver specific API. This -creates a disconnect between memory allocated and managed by device driver and -regular application memory (private anonymous, share memory or regular file -back memory). From here on i will refer to this aspect as split address space. -I use share address space to refer to the opposite situation ie one in which -any memory region can be use by device transparently. - -Split address space because device can only access memory allocated through the -device specific API. This imply that all memory object in a program are not -equal from device point of view which complicate large program that rely on a -wide set of libraries. - -Concretly this means that code that wants to leverage device like GPU need to -copy object between genericly allocated memory (malloc, mmap private/share/) -and memory allocated through the device driver API (this still end up with an -mmap but of the device file). - -For flat dataset (array, grid, image, ...) this isn't too hard to achieve but -complex data-set (list, tree, ...) are hard to get right. Duplicating a complex -data-set need to re-map all the pointer relations between each of its elements. -This is error prone and program gets harder to debug because of the duplicate -data-set. - -Split address space also means that library can not transparently use data they -are getting from core program or other library and thus each library might have -to duplicate its input data-set using specific memory allocator. Large project -suffer from this and waste resources because of the various memory copy. - -Duplicating each library API to accept as input or output memory allocted by -each device specific allocator is not a viable option. It would lead to a -combinatorial explosions in the library entry points. - -Finaly with the advance of high level language constructs (in C++ but in other -language too) it is now possible for compiler to leverage GPU or other devices -without even the programmer knowledge. Some of compiler identified patterns are -only do-able with a share address. It is as well more reasonable to use a share -address space for all the other patterns. - - -System bus, device memory characteristics -========================================= - -System bus cripple share address due to few limitations. Most system bus only -allow basic memory access from device to main memory, even cache coherency is -often optional. Access to device memory from CPU is even more limited, most -often than not it is not cache coherent. - -If we only consider the PCIE bus than device can access main memory (often -through an IOMMU) and be cache coherent with the CPUs. 
However it only allows -a limited set of atomic operation from device on main memory. This is worse -in the other direction the CPUs can only access a limited range of the device -memory and can not perform atomic operations on it. Thus device memory can not -be consider like regular memory from kernel point of view. - -Another crippling factor is the limited bandwidth (~32GBytes/s with PCIE 4.0 -and 16 lanes). This is 33 times less that fastest GPU memory (1 TBytes/s). -The final limitation is latency, access to main memory from the device has an -order of magnitude higher latency than when the device access its own memory. - -Some platform are developing new system bus or additions/modifications to PCIE -to address some of those limitations (OpenCAPI, CCIX). They mainly allow two -way cache coherency between CPU and device and allow all atomic operations the -architecture supports. Saddly not all platform are following this trends and -some major architecture are left without hardware solutions to those problems. - -So for share address space to make sense not only we must allow device to -access any memory memory but we must also permit any memory to be migrated to -device memory while device is using it (blocking CPU access while it happens). - - -Share address space and migration -================================= - -HMM intends to provide two main features. First one is to share the address -space by duplication the CPU page table into the device page table so same -address point to same memory and this for any valid main memory address in -the process address space. - -To achieve this, HMM offer a set of helpers to populate the device page table -while keeping track of CPU page table updates. Device page table updates are -not as easy as CPU page table updates. To update the device page table you must -allow a buffer (or use a pool of pre-allocated buffer) and write GPU specifics -commands in it to perform the update (unmap, cache invalidations and flush, -...). This can not be done through common code for all device. Hence why HMM -provides helpers to factor out everything that can be while leaving the gory -details to the device driver. - -The second mechanism HMM provide is a new kind of ZONE_DEVICE memory that does -allow to allocate a struct page for each page of the device memory. Those page -are special because the CPU can not map them. They however allow to migrate -main memory to device memory using exhisting migration mechanism and everything -looks like if page was swap out to disk from CPU point of view. Using a struct -page gives the easiest and cleanest integration with existing mm mechanisms. -Again here HMM only provide helpers, first to hotplug new ZONE_DEVICE memory -for the device memory and second to perform migration. Policy decision of what -and when to migrate things is left to the device driver. - -Note that any CPU access to a device page trigger a page fault and a migration -back to main memory ie when a page backing an given address A is migrated from -a main memory page to a device page then any CPU access to address A trigger a -page fault and initiate a migration back to main memory. - - -With this two features, HMM not only allow a device to mirror a process address -space and keeps both CPU and device page table synchronize, but also allow to -leverage device memory by migrating part of data-set that is actively use by a -device. 
- - -Address space mirroring implementation and API -============================================== - -Address space mirroring main objective is to allow to duplicate range of CPU -page table into a device page table and HMM helps keeping both synchronize. A -device driver that want to mirror a process address space must start with the -registration of an hmm_mirror struct:: - - int hmm_mirror_register(struct hmm_mirror *mirror, - struct mm_struct *mm); - int hmm_mirror_register_locked(struct hmm_mirror *mirror, - struct mm_struct *mm); - -The locked variant is to be use when the driver is already holding the mmap_sem -of the mm in write mode. The mirror struct has a set of callback that are use -to propagate CPU page table:: - - struct hmm_mirror_ops { - /* sync_cpu_device_pagetables() - synchronize page tables - * - * @mirror: pointer to struct hmm_mirror - * @update_type: type of update that occurred to the CPU page table - * @start: virtual start address of the range to update - * @end: virtual end address of the range to update - * - * This callback ultimately originates from mmu_notifiers when the CPU - * page table is updated. The device driver must update its page table - * in response to this callback. The update argument tells what action - * to perform. - * - * The device driver must not return from this callback until the device - * page tables are completely updated (TLBs flushed, etc); this is a - * synchronous call. - */ - void (*update)(struct hmm_mirror *mirror, - enum hmm_update action, - unsigned long start, - unsigned long end); - }; - -Device driver must perform update to the range following action (turn range -read only, or fully unmap, ...). Once driver callback returns the device must -be done with the update. - - -When device driver wants to populate a range of virtual address it can use -either:: - - int hmm_vma_get_pfns(struct vm_area_struct *vma, - struct hmm_range *range, - unsigned long start, - unsigned long end, - hmm_pfn_t *pfns); - int hmm_vma_fault(struct vm_area_struct *vma, - struct hmm_range *range, - unsigned long start, - unsigned long end, - hmm_pfn_t *pfns, - bool write, - bool block); - -First one (hmm_vma_get_pfns()) will only fetch present CPU page table entry and -will not trigger a page fault on missing or non present entry. The second one -do trigger page fault on missing or read only entry if write parameter is true. -Page fault use the generic mm page fault code path just like a CPU page fault. - -Both function copy CPU page table into their pfns array argument. Each entry in -that array correspond to an address in the virtual range. HMM provide a set of -flags to help driver identify special CPU page table entries. - -Locking with the update() callback is the most important aspect the driver must -respect in order to keep things properly synchronize. The usage pattern is:: - - int driver_populate_range(...) - { - struct hmm_range range; - ... - again: - ret = hmm_vma_get_pfns(vma, &range, start, end, pfns); - if (ret) - return ret; - take_lock(driver->update); - if (!hmm_vma_range_done(vma, &range)) { - release_lock(driver->update); - goto again; - } - - // Use pfns array content to update device page table - - release_lock(driver->update); - return 0; - } - -The driver->update lock is the same lock that driver takes inside its update() -callback. That lock must be call before hmm_vma_range_done() to avoid any race -with a concurrent CPU page table update. 
- -HMM implements all this on top of the mmu_notifier API because we wanted to a -simpler API and also to be able to perform optimization latter own like doing -concurrent device update in multi-devices scenario. - -HMM also serve as an impedence missmatch between how CPU page table update are -done (by CPU write to the page table and TLB flushes) from how device update -their own page table. Device update is a multi-step process, first appropriate -commands are write to a buffer, then this buffer is schedule for execution on -the device. It is only once the device has executed commands in the buffer that -the update is done. Creating and scheduling update command buffer can happen -concurrently for multiple devices. Waiting for each device to report commands -as executed is serialize (there is no point in doing this concurrently). - - -Represent and manage device memory from core kernel point of view -================================================================= - -Several differents design were try to support device memory. First one use -device specific data structure to keep information about migrated memory and -HMM hooked itself in various place of mm code to handle any access to address -that were back by device memory. It turns out that this ended up replicating -most of the fields of struct page and also needed many kernel code path to be -updated to understand this new kind of memory. - -Thing is most kernel code path never try to access the memory behind a page -but only care about struct page contents. Because of this HMM switchted to -directly using struct page for device memory which left most kernel code path -un-aware of the difference. We only need to make sure that no one ever try to -map those page from the CPU side. - -HMM provide a set of helpers to register and hotplug device memory as a new -region needing struct page. This is offer through a very simple API:: - - struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, - struct device *device, - unsigned long size); - void hmm_devmem_remove(struct hmm_devmem *devmem); - -The hmm_devmem_ops is where most of the important things are:: - - struct hmm_devmem_ops { - void (*free)(struct hmm_devmem *devmem, struct page *page); - int (*fault)(struct hmm_devmem *devmem, - struct vm_area_struct *vma, - unsigned long addr, - struct page *page, - unsigned flags, - pmd_t *pmdp); - }; - -The first callback (free()) happens when the last reference on a device page is -drop. This means the device page is now free and no longer use by anyone. The -second callback happens whenever CPU try to access a device page which it can -not do. This second callback must trigger a migration back to system memory. - - -Migrate to and from device memory -================================= - -Because CPU can not access device memory, migration must use device DMA engine -to perform copy from and to device memory. For this we need a new migration -helper:: - - int migrate_vma(const struct migrate_vma_ops *ops, - struct vm_area_struct *vma, - unsigned long mentries, - unsigned long start, - unsigned long end, - unsigned long *src, - unsigned long *dst, - void *private); - -Unlike other migration function it works on a range of virtual address, there -is two reasons for that. First device DMA copy has a high setup overhead cost -and thus batching multiple pages is needed as otherwise the migration overhead -make the whole excersie pointless. 
The second reason is because driver trigger -such migration base on range of address the device is actively accessing. - -The migrate_vma_ops struct define two callbacks. First one (alloc_and_copy()) -control destination memory allocation and copy operation. Second one is there -to allow device driver to perform cleanup operation after migration:: - - struct migrate_vma_ops { - void (*alloc_and_copy)(struct vm_area_struct *vma, - const unsigned long *src, - unsigned long *dst, - unsigned long start, - unsigned long end, - void *private); - void (*finalize_and_map)(struct vm_area_struct *vma, - const unsigned long *src, - const unsigned long *dst, - unsigned long start, - unsigned long end, - void *private); - }; - -It is important to stress that this migration helpers allow for hole in the -virtual address range. Some pages in the range might not be migrated for all -the usual reasons (page is pin, page is lock, ...). This helper does not fail -but just skip over those pages. - -The alloc_and_copy() might as well decide to not migrate all pages in the -range (for reasons under the callback control). For those the callback just -have to leave the corresponding dst entry empty. - -Finaly the migration of the struct page might fails (for file back page) for -various reasons (failure to freeze reference, or update page cache, ...). If -that happens then the finalize_and_map() can catch any pages that was not -migrated. Note those page were still copied to new page and thus we wasted -bandwidth but this is considered as a rare event and a price that we are -willing to pay to keep all the code simpler. - - -Memory cgroup (memcg) and rss accounting -======================================== - -For now device memory is accounted as any regular page in rss counters (either -anonymous if device page is use for anonymous, file if device page is use for -file back page or shmem if device page is use for share memory). This is a -deliberate choice to keep existing application that might start using device -memory without knowing about it to keep runing unimpacted. - -Drawbacks is that OOM killer might kill an application using a lot of device -memory and not a lot of regular system memory and thus not freeing much system -memory. We want to gather more real world experience on how application and -system react under memory pressure in the presence of device memory before -deciding to account device memory differently. - - -Same decision was made for memory cgroup. Device memory page are accounted -against same memory cgroup a regular page would be accounted to. This does -simplify migration to and from device memory. This also means that migration -back from device memory to regular memory can not fail because it would -go above memory cgroup limit. We might revisit this choice latter on once we -get more experience in how device memory is use and its impact on memory -resource control. - - -Note that device memory can never be pin nor by device driver nor through GUP -and thus such memory is always free upon process exit. Or when last reference -is drop in case of share memory or file back memory. diff --git a/Documentation/vm/hugetlbfs_reserv.rst b/Documentation/vm/hugetlbfs_reserv.rst new file mode 100644 index 000000000000..36a87a2ea435 --- /dev/null +++ b/Documentation/vm/hugetlbfs_reserv.rst @@ -0,0 +1,587 @@ +.. 
_hugetlbfs_reserve:
+
+=====================
+Hugetlbfs Reservation
+=====================
+
+Overview
+========
+
+Huge pages as described at :ref:`hugetlbpage` are typically
+preallocated for application use. These huge pages are instantiated in a
+task's address space at page fault time if the VMA indicates huge pages are
+to be used. If no huge page exists at page fault time, the task is sent
+a SIGBUS and often dies an unhappy death. Shortly after huge page support
+was added, it was determined that it would be better to detect a shortage
+of huge pages at mmap() time. The idea is that if there were not enough
+huge pages to cover the mapping, the mmap() would fail. This was first
+done with a simple check in the code at mmap() time to determine if there
+were enough free huge pages to cover the mapping. Like most things in the
+kernel, the code has evolved over time. However, the basic idea was to
+'reserve' huge pages at mmap() time to ensure that huge pages would be
+available for page faults in that mapping. The description below attempts to
+describe how huge page reserve processing is done in the v4.10 kernel.
+
+
+Audience
+========
+This description is primarily targeted at kernel developers who are modifying
+hugetlbfs code.
+
+
+The Data Structures
+===================
+
+resv_huge_pages
+    This is a global (per-hstate) count of reserved huge pages. Reserved
+    huge pages are only available to the task which reserved them.
+    Therefore, the number of huge pages generally available is computed
+    as (``free_huge_pages - resv_huge_pages``).
+Reserve Map
+    A reserve map is described by the structure::
+
+        struct resv_map {
+            struct kref refs;
+            spinlock_t lock;
+            struct list_head regions;
+            long adds_in_progress;
+            struct list_head region_cache;
+            long region_cache_count;
+        };
+
+    There is one reserve map for each huge page mapping in the system.
+    The regions list within the resv_map describes the regions within
+    the mapping. A region is described as::
+
+        struct file_region {
+            struct list_head link;
+            long from;
+            long to;
+        };
+
+    The 'from' and 'to' fields of the file region structure are huge page
+    indices into the mapping. Depending on the type of mapping, a
+    region in the reserv_map may indicate reservations exist for the
+    range, or reservations do not exist.
+Flags for MAP_PRIVATE Reservations
+    These are stored in the bottom bits of the reservation map pointer.
+
+    ``#define HPAGE_RESV_OWNER    (1UL << 0)``
+        Indicates this task is the owner of the reservations
+        associated with the mapping.
+    ``#define HPAGE_RESV_UNMAPPED (1UL << 1)``
+        Indicates task originally mapping this range (and creating
+        reserves) has unmapped a page from this task (the child)
+        due to a failed COW.
+Page Flags
+    The PagePrivate page flag is used to indicate that a huge page
+    reservation must be restored when the huge page is freed. More
+    details will be discussed in the "Freeing Huge Pages" section.
+
+
+Reservation Map Location (Private or Shared)
+============================================
+
+A huge page mapping or segment is either private or shared. If private,
+it is typically only available to a single address space (task). If shared,
+it can be mapped into multiple address spaces (tasks). The location and
+semantics of the reservation map are significantly different for the two
+types of mappings. Location differences are:
+
+- For private mappings, the reservation map hangs off the VMA structure.
+  Specifically, vma->vm_private_data. This reserve map is created at the
+  time the mapping (mmap(MAP_PRIVATE)) is created.
+- For shared mappings, the reservation map hangs off the inode. Specifically,
+  inode->i_mapping->private_data. Since shared mappings are always backed
+  by files in the hugetlbfs filesystem, the hugetlbfs code ensures each inode
+  contains a reservation map. As a result, the reservation map is allocated
+  when the inode is created.
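+
+As a rough illustration of these two locations, a lookup helper shaped like
+the kernel's vma_resv_map() would behave approximately as follows. This is a
+simplified sketch (the hypothetical example_resv_map() is not a kernel
+function, and the real code masks the flag bits slightly differently)::
+
+    static struct resv_map *example_resv_map(struct vm_area_struct *vma)
+    {
+        if (vma->vm_flags & VM_MAYSHARE) {
+            /* Shared: the map lives on the hugetlbfs inode. */
+            struct inode *inode = file_inode(vma->vm_file);
+
+            return (struct resv_map *)inode->i_mapping->private_data;
+        }
+
+        /* Private: the map hangs off the VMA itself; mask off the
+         * HPAGE_RESV_* flags stored in the low bits of the pointer. */
+        return (struct resv_map *)((unsigned long)vma->vm_private_data &
+                ~(HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED));
+    }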
+
+
+Creating Reservations
+=====================
+Reservations are created when a huge page backed shared memory segment is
+created (shmget(SHM_HUGETLB)) or a mapping is created via mmap(MAP_HUGETLB).
+These operations result in a call to the routine hugetlb_reserve_pages()::
+
+    int hugetlb_reserve_pages(struct inode *inode,
+                              long from, long to,
+                              struct vm_area_struct *vma,
+                              vm_flags_t vm_flags)
+
+The first thing hugetlb_reserve_pages() does is check whether the NORESERVE
+flag was specified in either the shmget() or mmap() call. If NORESERVE
+was specified, then this routine returns immediately as no reservations
+are desired.
+
+The arguments 'from' and 'to' are huge page indices into the mapping or
+underlying file. For shmget(), 'from' is always 0 and 'to' corresponds to
+the length of the segment/mapping. For mmap(), the offset argument could
+be used to specify the offset into the underlying file. In such a case,
+the 'from' and 'to' arguments have been adjusted by this offset.
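+
+For illustration, if a mapping of 'length' bytes is created at file offset
+'offset' (both multiples of the huge page size), the indices would be derived
+roughly as follows. This is a hypothetical snippet, not kernel code; 'h' is
+assumed to be the hstate for the huge page size in use::
+
+    unsigned long from = offset >> huge_page_shift(h);
+    unsigned long to = from + (length >> huge_page_shift(h));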
+
+One of the big differences between PRIVATE and SHARED mappings is the way
+in which reservations are represented in the reservation map.
+
+- For shared mappings, an entry in the reservation map indicates a reservation
+  exists or did exist for the corresponding page. As reservations are
+  consumed, the reservation map is not modified.
+- For private mappings, the lack of an entry in the reservation map indicates
+  a reservation exists for the corresponding page. As reservations are
+  consumed, entries are added to the reservation map. Therefore, the
+  reservation map can also be used to determine which reservations have
+  been consumed.
+
+For private mappings, hugetlb_reserve_pages() creates the reservation map and
+hangs it off the VMA structure. In addition, the HPAGE_RESV_OWNER flag is set
+to indicate this VMA owns the reservations.
+
+The reservation map is consulted to determine how many huge page reservations
+are needed for the current mapping/segment. For private mappings, this is
+always the value (to - from). However, for shared mappings it is possible
+that some reservations may already exist within the range (to - from). See
+the section :ref:`Reservation Map Modifications <resv_map_modifications>`
+for details on how this is accomplished.
+
+The mapping may be associated with a subpool. If so, the subpool is consulted
+to ensure there is sufficient space for the mapping. It is possible that the
+subpool has set aside reservations that can be used for the mapping. See the
+section :ref:`Subpool Reservations <sub_pool_resv>` for more details.
+
+After consulting the reservation map and subpool, the number of needed new
+reservations is known. The routine hugetlb_acct_memory() is called to check
+for and take the requested number of reservations. hugetlb_acct_memory()
+calls into routines that potentially allocate and adjust surplus page counts.
+However, within those routines the code is simply checking to ensure there
+are enough free huge pages to accommodate the reservation. If there are,
+the global reservation count resv_huge_pages is adjusted something like the
+following (recall that the number of huge pages generally available is
+``free_huge_pages - resv_huge_pages``)::
+
+    if (resv_needed <= (free_huge_pages - resv_huge_pages))
+        resv_huge_pages += resv_needed;
+
+Note that the global lock hugetlb_lock is held when checking and adjusting
+these counters.
+
+If there were enough free huge pages and the global count resv_huge_pages
+was adjusted, then the reservation map associated with the mapping is
+modified to reflect the reservations. In the case of a shared mapping, a
+file_region will exist that includes the range 'from' - 'to'. For private
+mappings, no modifications are made to the reservation map as lack of an
+entry indicates a reservation exists.
+
+If hugetlb_reserve_pages() was successful, the global reservation count and
+reservation map associated with the mapping will be modified as required to
+ensure reservations exist for the range 'from' - 'to'.
+
+.. _consume_resv:
+
+Consuming Reservations/Allocating a Huge Page
+=============================================
+
+Reservations are consumed when huge pages associated with the reservations
+are allocated and instantiated in the corresponding mapping. The allocation
+is performed within the routine alloc_huge_page()::
+
+    struct page *alloc_huge_page(struct vm_area_struct *vma,
+                                 unsigned long addr, int avoid_reserve)
+
+alloc_huge_page is passed a VMA pointer and a virtual address, so it can
+consult the reservation map to determine if a reservation exists. In addition,
+alloc_huge_page takes the argument avoid_reserve which indicates reserves
+should not be used even if it appears they have been set aside for the
+specified address. The avoid_reserve argument is most often used in the case
+of Copy on Write and Page Migration where additional copies of an existing
+page are being allocated.
+
+The helper routine vma_needs_reservation() is called to determine if a
+reservation exists for the address within the mapping (vma). See the section
+:ref:`Reservation Map Helper Routines <resv_map_helpers>` for detailed
+information on what this routine does. The value returned from
+vma_needs_reservation() is generally 0 or 1: 0 if a reservation exists for
+the address, 1 if no reservation exists. If a reservation does not exist,
+and there is a subpool associated with the mapping, the subpool is consulted
+to determine if it contains reservations. If the subpool contains
+reservations, one can be used for this allocation. However, in every case
+the avoid_reserve argument overrides the use of a reservation for the
+allocation. After determining whether a reservation exists and can be used
+for the allocation, the routine dequeue_huge_page_vma() is called. This
+routine takes two arguments related to reservations:
+
+- avoid_reserve, this is the same value/argument passed to alloc_huge_page()
+- chg, even though this argument is of type long only the values 0 or 1 are
+  passed to dequeue_huge_page_vma. If the value is 0, it indicates a
+  reservation exists (see the section "Reservations and Memory Policy" for
+  possible issues). If the value is 1, it indicates a reservation does not
+  exist and the page must be taken from the global free pool if possible.
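+
+Schematically, the reservation related part of this path boils down to
+pseudo-C like the following. This is a heavily simplified sketch: locking and
+error handling are elided, 'spool' stands for the associated subpool, and
+subpool_covers_page() is a made-up placeholder for the subpool consultation
+described above::
+
+    long chg = vma_needs_reservation(h, vma, addr);
+
+    if (chg < 0)
+        return ERR_PTR(-ENOMEM);
+    if (chg && subpool_covers_page(spool))
+        chg = 0;    /* a subpool reserve stands in for the page */
+
+    page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg);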
+
+The free lists associated with the memory policy of the VMA are searched for
+a free page. If a page is found, the value free_huge_pages is decremented
+when the page is removed from the free list. If there was a reservation
+associated with the page, the following adjustments are made::
+
+    SetPagePrivate(page);   /* Indicates allocating this page consumed
+                             * a reservation, and if an error is
+                             * encountered such that the page must be
+                             * freed, the reservation will be restored. */
+    resv_huge_pages--;      /* Decrement the global reservation count */
+
+Note, if no huge page can be found that satisfies the VMA's memory policy,
+an attempt will be made to allocate one using the buddy allocator. This
+brings up the issue of surplus huge pages and overcommit which is beyond
+the scope of reservations. Even if a surplus page is allocated, the same
+reservation based adjustments as above will be made: SetPagePrivate(page) and
+resv_huge_pages--.
+
+After obtaining a new huge page, (page)->private is set to the value of
+the subpool associated with the page if it exists. This will be used for
+subpool accounting when the page is freed.
+
+The routine vma_commit_reservation() is then called to adjust the reserve
+map based on the consumption of the reservation. In general, this involves
+ensuring the page is represented within a file_region structure of the region
+map. For shared mappings where the reservation was present, an entry
+in the reserve map already existed so no change is made. However, if there
+was no reservation in a shared mapping, or this was a private mapping, a new
+entry must be created.
+
+It is possible that the reserve map could have been changed between the call
+to vma_needs_reservation() at the beginning of alloc_huge_page() and the
+call to vma_commit_reservation() after the page was allocated. This would
+be possible if hugetlb_reserve_pages was called for the same page in a shared
+mapping. In such cases, the reservation count and subpool free page count
+will be off by one. This rare condition can be identified by comparing the
+return value from vma_needs_reservation and vma_commit_reservation. If such
+a race is detected, the subpool and global reserve counts are adjusted to
+compensate. See the section
+:ref:`Reservation Map Helper Routines <resv_map_helpers>` for more
+information on these routines.
+
+
+Instantiate Huge Pages
+======================
+
+After huge page allocation, the page is typically added to the page tables
+of the allocating task. Before this, pages in a shared mapping are added
+to the page cache and pages in private mappings are added to an anonymous
+reverse mapping. In both cases, the PagePrivate flag is cleared. Therefore,
+when a huge page that has been instantiated is freed, no adjustment is made
+to the global reservation count (resv_huge_pages).
+
+
+Freeing Huge Pages
+==================
+
+Huge page freeing is performed by the routine free_huge_page(). This routine
+is the destructor for hugetlbfs compound pages. As a result, it is only
+passed a pointer to the page struct. When a huge page is freed, reservation
+accounting may need to be performed. This would be the case if the page was
+associated with a subpool that contained reserves, or the page is being freed
+on an error path where a global reserve count must be restored.
+
+The page->private field points to any subpool associated with the page.
+If the PagePrivate flag is set, it indicates the global reserve count should
+be adjusted (see the section
+:ref:`Consuming Reservations/Allocating a Huge Page <consume_resv>`
+for information on how these are set).
+
+The routine first calls hugepage_subpool_put_pages() for the page. If this
+routine returns a value of 0 (rather than the value 1 that was passed), it
+indicates reserves are associated with the subpool, and this newly freed page
+must be used to keep the number of subpool reserves above the minimum size.
+Therefore, the global resv_huge_pages counter is incremented in this case.
+
+If the PagePrivate flag was set in the page, the global resv_huge_pages
+counter will always be incremented.
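+
+In pseudo-C, the reservation related tail of free_huge_page() therefore looks
+roughly like the following. This is a simplified sketch of the logic just
+described, not the verbatim kernel code; 'h' is the hstate of the page::
+
+    struct hugepage_subpool *spool =
+        (struct hugepage_subpool *)page_private(page);
+    bool restore_reserve = PagePrivate(page);
+
+    ClearPagePrivate(page);
+
+    /* A 0 return (rather than the 1 passed) means this page is needed
+     * to keep the subpool at its minimum size. */
+    if (hugepage_subpool_put_pages(spool, 1) == 0)
+        restore_reserve = true;
+
+    if (restore_reserve)
+        h->resv_huge_pages++;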
+
+.. _sub_pool_resv:
+
+Subpool Reservations
+====================
+
+There is a struct hstate associated with each huge page size. The hstate
+tracks all huge pages of the specified size. A subpool represents a subset
+of pages within a hstate that is associated with a mounted hugetlbfs
+filesystem.
+
+When a hugetlbfs filesystem is mounted, a min_size option can be specified
+which indicates the minimum number of huge pages required by the filesystem.
+If this option is specified, the number of huge pages corresponding to
+min_size is reserved for use by the filesystem. This number is tracked in
+the min_hpages field of a struct hugepage_subpool. At mount time,
+hugetlb_acct_memory(min_hpages) is called to reserve the specified number of
+huge pages. If they can not be reserved, the mount fails.
+
+The routines hugepage_subpool_get/put_pages() are called when pages are
+obtained from or released back to a subpool. They perform all subpool
+accounting, and track any reservations associated with the subpool.
+hugepage_subpool_get/put_pages are passed the number of huge pages by which
+to adjust the subpool 'used page' count (down for get, up for put). Normally,
+they return the same value that was passed or an error if not enough pages
+exist in the subpool.
+
+However, if reserves are associated with the subpool, a return value less
+than the passed value may be returned. This return value indicates the
+number of additional global pool adjustments which must be made. For example,
+suppose a subpool contains 3 reserved huge pages and someone asks for 5.
+The 3 reserved pages associated with the subpool can be used to satisfy part
+of the request. But, 2 pages must be obtained from the global pools. To
+relay this information to the caller, the value 2 is returned. The caller
+is then responsible for attempting to obtain the additional two pages from
+the global pools.
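+
+A caller must therefore treat the return value as the number of pages still
+owed to the global pool. Using the example above, the calling sequence would
+look roughly like the following schematic (no locking or cleanup shown;
+'spool' and 'h' are the subpool and hstate involved)::
+
+    /* subpool holds 3 reserves; ask for 5 pages */
+    long needed = hugepage_subpool_get_pages(spool, 5);
+
+    if (needed < 0)
+        return -ENOSPC; /* subpool could not satisfy the request */
+
+    /* needed == 2: only two pages must still be accounted against
+     * the global pools. */
+    int ret = hugetlb_acct_memory(h, needed);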
+
+
+COW and Reservations
+====================
+
+Since shared mappings all point to and use the same underlying pages, the
+biggest reservation concern for COW is private mappings. In this case,
+two tasks can be pointing at the same previously allocated page. One task
+attempts to write to the page, so a new page must be allocated so that each
+task points to its own page.
+
+When the page was originally allocated, the reservation for that page was
+consumed. When an attempt to allocate a new page is made as a result of
+COW, it is possible that no free huge pages are available and the allocation
+will fail.
+
+When the private mapping was originally created, the owner of the mapping
+was noted by setting the HPAGE_RESV_OWNER bit in the pointer to the reservation
+map of the owner. Since the owner created the mapping, the owner owns all
+the reservations associated with the mapping. Therefore, when a write fault
+occurs and there is no page available, different action is taken for the owner
+and non-owner of the reservation.
+
+In the case where the faulting task is not the owner, the fault will fail and
+the task will typically receive a SIGBUS.
+
+If the owner is the faulting task, we want it to succeed since it owned the
+original reservation. To accomplish this, the page is unmapped from the
+non-owning task. In this way, the only reference is from the owning task.
+In addition, the HPAGE_RESV_UNMAPPED bit is set in the reservation map pointer
+of the non-owning task. The non-owning task may receive a SIGBUS if it later
+faults on a non-present page. But, the original owner of the
+mapping/reservation will behave as expected.
+
+
+.. _resv_map_modifications:
+
+Reservation Map Modifications
+=============================
+
+The following low level routines are used to make modifications to a
+reservation map. Typically, these routines are not called directly. Rather,
+a reservation map helper routine is called which calls one of these low level
+routines. These low level routines are fairly well documented in the source
+code (mm/hugetlb.c). These routines are::
+
+    long region_chg(struct resv_map *resv, long f, long t);
+    long region_add(struct resv_map *resv, long f, long t);
+    void region_abort(struct resv_map *resv, long f, long t);
+    long region_count(struct resv_map *resv, long f, long t);
+
+Operations on the reservation map typically involve two operations (a
+schematic calling sequence is shown below):
+
+1) region_chg() is called to examine the reserve map and determine how
+   many pages in the specified range [f, t) are NOT currently represented.
+
+   The calling code performs global checks and allocations to determine if
+   there are enough huge pages for the operation to succeed.
+
+2)
+   a) If the operation can succeed, region_add() is called to actually modify
+      the reservation map for the same range [f, t) previously passed to
+      region_chg().
+   b) If the operation can not succeed, region_abort() is called for the same
+      range [f, t) to abort the operation.
+
+Note that this is a two step process where region_add() and region_abort()
+are guaranteed to succeed after a prior call to region_chg() for the same
+range. region_chg() is responsible for pre-allocating any data structures
+necessary to ensure the subsequent operations (specifically region_add())
+will succeed.
+
+As mentioned above, region_chg() determines the number of pages in the range
+which are NOT currently represented in the map. This number is returned to
+the caller. region_add() returns the number of pages in the range added to
+the map. In most cases, the return value of region_add() is the same as the
+return value of region_chg(). However, in the case of shared mappings it is
+possible for changes to the reservation map to be made between the calls to
+region_chg() and region_add(). In this case, the return value of region_add()
+will not match the return value of region_chg(). It is likely that in such
+cases global counts and subpool accounting will be incorrect and in need of
+adjustment. It is the responsibility of the caller to check for this condition
+and make the appropriate adjustments.
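+
+The canonical calling sequence is therefore something like the following
+schematic, where global_checks_fail() is a placeholder for the global checks
+and allocations mentioned in step 1 (error handling elided)::
+
+    long chg, add;
+
+    chg = region_chg(resv, f, t);   /* pages in [f, t) not in the map */
+    if (chg < 0)
+        return chg;
+
+    if (global_checks_fail(chg)) {
+        region_abort(resv, f, t);
+        return -ENOMEM;
+    }
+
+    add = region_add(resv, f, t);   /* cannot fail after region_chg() */
+    if (add != chg) {
+        /* The map changed between the two calls (shared mappings
+         * only); global and subpool counts need fixing up. */
+    }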
+
+The routine region_del() is called to remove regions from a reservation map.
+It is typically called in the following situations:
+
+- When a file in the hugetlbfs filesystem is being removed, the inode will
+  be released and the reservation map freed. Before freeing the reservation
+  map, all the individual file_region structures must be freed. In this case,
+  region_del is passed the range [0, LONG_MAX).
+- When a hugetlbfs file is being truncated. In this case, all allocated pages
+  after the new file size must be freed. In addition, any file_region entries
+  in the reservation map past the new end of file must be deleted. In this
+  case, region_del is passed the range [new_end_of_file, LONG_MAX).
+- When a hole is being punched in a hugetlbfs file. In this case, huge pages
+  are removed from the middle of the file one at a time. As the pages are
+  removed, region_del() is called to remove the corresponding entry from the
+  reservation map. In this case, region_del is passed the range
+  [page_idx, page_idx + 1).
+
+In every case, region_del() will return the number of pages removed from the
+reservation map. In VERY rare cases, region_del() can fail. This can only
+happen in the hole punch case where it has to split an existing file_region
+entry and can not allocate a new structure. In this error case, region_del()
+will return -ENOMEM. The problem here is that the reservation map will
+indicate that there is a reservation for the page. However, the subpool and
+global reservation counts will not reflect the reservation. To handle this
+situation, the routine hugetlb_fix_reserve_counts() is called to adjust the
+counters so that they correspond with the reservation map entry that could
+not be deleted.
+
+region_count() is called when unmapping a private huge page mapping. In
+private mappings, the lack of an entry in the reservation map indicates that
+a reservation exists. Therefore, by counting the number of entries in the
+reservation map, we know how many reservations were consumed and how many are
+outstanding (outstanding = (end - start) - region_count(resv, start, end)).
+Since the mapping is going away, the subpool and global reservation counts
+are decremented by the number of outstanding reservations.
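+
+Schematically, the teardown accounting described above amounts to the
+following sketch (hypothetical pseudo-C; the real unmap path spreads this
+over several routines, and a negative delta is assumed here to decrement the
+reservation counts)::
+
+    long consumed = region_count(resv, start, end);
+    long outstanding = (end - start) - consumed;
+
+    /* Return the never-consumed reservations to the global pool. */
+    hugetlb_acct_memory(h, -outstanding);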
+
+.. _resv_map_helpers:
+
+Reservation Map Helper Routines
+===============================
+
+Several helper routines exist to query and modify the reservation maps.
+These routines are only interested in reservations for a specific huge
+page, so they just pass in an address instead of a range. In addition,
+they pass in the associated VMA. From the VMA, the type of mapping (private
+or shared) and the location of the reservation map (inode or VMA) can be
+determined. These routines simply call the underlying routines described
+in the section "Reservation Map Modifications". However, they do take into
+account the 'opposite' meaning of reservation map entries for private and
+shared mappings and hide this detail from the caller::
+
+    long vma_needs_reservation(struct hstate *h,
+                               struct vm_area_struct *vma,
+                               unsigned long addr)
+
+This routine calls region_chg() for the specified page. If no reservation
+exists, 1 is returned. If a reservation exists, 0 is returned::
+
+    long vma_commit_reservation(struct hstate *h,
+                                struct vm_area_struct *vma,
+                                unsigned long addr)
+
+This calls region_add() for the specified page. As in the case of region_chg
+and region_add, this routine is to be called after a previous call to
+vma_needs_reservation. It will add a reservation entry for the page. It
+returns 1 if the reservation was added and 0 if not. The return value should
+be compared with the return value of the previous call to
+vma_needs_reservation. An unexpected difference indicates the reservation
+map was modified between calls::
+
+    void vma_end_reservation(struct hstate *h,
+                             struct vm_area_struct *vma,
+                             unsigned long addr)
+
+This calls region_abort() for the specified page. As in the case of region_chg
+and region_abort, this routine is to be called after a previous call to
+vma_needs_reservation. It will abort/end the in-progress reservation add
+operation::
+
+    long vma_add_reservation(struct hstate *h,
+                             struct vm_area_struct *vma,
+                             unsigned long addr)
+
+This is a special wrapper routine to help facilitate reservation cleanup
+on error paths. It is only called from the routine restore_reserve_on_error().
+This routine is used in conjunction with vma_needs_reservation in an attempt
+to add a reservation to the reservation map. It takes into account the
+different reservation map semantics for private and shared mappings. Hence,
+region_add is called for shared mappings (as an entry present in the map
+indicates a reservation), and region_del is called for private mappings (as
+the absence of an entry in the map indicates a reservation). See the section
+"Reservation Cleanup in Error Paths" for more information on what needs to
+be done on error paths.
+
+
+Reservation Cleanup in Error Paths
+==================================
+
+As mentioned in the section
+:ref:`Reservation Map Helper Routines <resv_map_helpers>`, reservation
+map modifications are performed in two steps. First vma_needs_reservation
+is called before a page is allocated. If the allocation is successful,
+then vma_commit_reservation is called. If not, vma_end_reservation is called.
+Global and subpool reservation counts are adjusted based on success or failure
+of the operation and all is well.
+
+Additionally, after a huge page is instantiated, the PagePrivate flag is
+cleared so that accounting when the page is ultimately freed is correct.
+
+However, there are several instances where errors are encountered after a huge
+page is allocated but before it is instantiated. In this case, the page
+allocation has consumed the reservation and made the appropriate subpool,
+reservation map and global count adjustments. If the page is freed at this
+time (before instantiation and clearing of PagePrivate), then free_huge_page
+will increment the global reservation count. However, the reservation map
+indicates the reservation was consumed. This resulting inconsistent state
+will cause the 'leak' of a reserved huge page. The global reserve count will
+be higher than it should be and prevent allocation of a pre-allocated page.
+
+The routine restore_reserve_on_error() attempts to handle this situation. It
+is fairly well documented. The intention of this routine is to restore
+the reservation map to the way it was before the page allocation. In this
+way, the state of the reservation map will correspond to the global reservation
+count after the page is freed.
+
+The routine restore_reserve_on_error itself may encounter errors while
+attempting to restore the reservation map entry. In this case, it will
+simply clear the PagePrivate flag of the page. In this way, the global
+reserve count will not be incremented when the page is freed. However, the
+reservation map will continue to look as though the reservation was consumed.
+A page can still be allocated for the address, but it will not use a reserved
+page as originally intended.
+
+There is some code (most notably userfaultfd) which can not call
+restore_reserve_on_error. In this case, it simply modifies the PagePrivate
+flag so that a reservation will not be leaked when the huge page is freed.
+
+
+Reservations and Memory Policy
+==============================
+Per-node huge page lists existed in struct hstate when git was first used
+to manage Linux code.
The concept of reservations was added some time later. +When reservations were added, no attempt was made to take memory policy +into account. While cpusets are not exactly the same as memory policy, this +comment in hugetlb_acct_memory sums up the interaction between reservations +and cpusets/memory policy:: + + /* + * When cpuset is configured, it breaks the strict hugetlb page + * reservation as the accounting is done on a global variable. Such + * reservation is completely rubbish in the presence of cpuset because + * the reservation is not checked against page availability for the + * current cpuset. Application can still potentially OOM'ed by kernel + * with lack of free htlb page in cpuset that the task is in. + * Attempt to enforce strict accounting with cpuset is almost + * impossible (or too ugly) because cpuset is too fluid that + * task or memory node can be dynamically moved between cpusets. + * + * The change of semantics for shared hugetlb mapping with cpuset is + * undesirable. However, in order to preserve some of the semantics, + * we fall back to check against current free page availability as + * a best attempt and hopefully to minimize the impact of changing + * semantics that cpuset has. + */ + +Huge page reservations were added to prevent unexpected page allocation +failures (OOM) at page fault time. However, if an application makes use +of cpusets or memory policy there is no guarantee that huge pages will be +available on the required nodes. This is true even if there are a sufficient +number of global reservations. + + +Mike Kravetz, 7 April 2017 diff --git a/Documentation/vm/hugetlbfs_reserv.txt b/Documentation/vm/hugetlbfs_reserv.txt deleted file mode 100644 index 36a87a2ea435..000000000000 --- a/Documentation/vm/hugetlbfs_reserv.txt +++ /dev/null @@ -1,587 +0,0 @@ -.. _hugetlbfs_reserve: - -===================== -Hugetlbfs Reservation -===================== - -Overview -======== - -Huge pages as described at :ref:`hugetlbpage` are typically -preallocated for application use. These huge pages are instantiated in a -task's address space at page fault time if the VMA indicates huge pages are -to be used. If no huge page exists at page fault time, the task is sent -a SIGBUS and often dies an unhappy death. Shortly after huge page support -was added, it was determined that it would be better to detect a shortage -of huge pages at mmap() time. The idea is that if there were not enough -huge pages to cover the mapping, the mmap() would fail. This was first -done with a simple check in the code at mmap() time to determine if there -were enough free huge pages to cover the mapping. Like most things in the -kernel, the code has evolved over time. However, the basic idea was to -'reserve' huge pages at mmap() time to ensure that huge pages would be -available for page faults in that mapping. The description below attempts to -describe how huge page reserve processing is done in the v4.10 kernel. - - -Audience -======== -This description is primarily targeted at kernel developers who are modifying -hugetlbfs code. - - -The Data Structures -=================== - -resv_huge_pages - This is a global (per-hstate) count of reserved huge pages. Reserved - huge pages are only available to the task which reserved them. - Therefore, the number of huge pages generally available is computed - as (``free_huge_pages - resv_huge_pages``). 
-Reserve Map - A reserve map is described by the structure:: - - struct resv_map { - struct kref refs; - spinlock_t lock; - struct list_head regions; - long adds_in_progress; - struct list_head region_cache; - long region_cache_count; - }; - - There is one reserve map for each huge page mapping in the system. - The regions list within the resv_map describes the regions within - the mapping. A region is described as:: - - struct file_region { - struct list_head link; - long from; - long to; - }; - - The 'from' and 'to' fields of the file region structure are huge page - indices into the mapping. Depending on the type of mapping, a - region in the reserv_map may indicate reservations exist for the - range, or reservations do not exist. -Flags for MAP_PRIVATE Reservations - These are stored in the bottom bits of the reservation map pointer. - - ``#define HPAGE_RESV_OWNER (1UL << 0)`` - Indicates this task is the owner of the reservations - associated with the mapping. - ``#define HPAGE_RESV_UNMAPPED (1UL << 1)`` - Indicates task originally mapping this range (and creating - reserves) has unmapped a page from this task (the child) - due to a failed COW. -Page Flags - The PagePrivate page flag is used to indicate that a huge page - reservation must be restored when the huge page is freed. More - details will be discussed in the "Freeing huge pages" section. - - -Reservation Map Location (Private or Shared) -============================================ - -A huge page mapping or segment is either private or shared. If private, -it is typically only available to a single address space (task). If shared, -it can be mapped into multiple address spaces (tasks). The location and -semantics of the reservation map is significantly different for two types -of mappings. Location differences are: - -- For private mappings, the reservation map hangs off the the VMA structure. - Specifically, vma->vm_private_data. This reserve map is created at the - time the mapping (mmap(MAP_PRIVATE)) is created. -- For shared mappings, the reservation map hangs off the inode. Specifically, - inode->i_mapping->private_data. Since shared mappings are always backed - by files in the hugetlbfs filesystem, the hugetlbfs code ensures each inode - contains a reservation map. As a result, the reservation map is allocated - when the inode is created. - - -Creating Reservations -===================== -Reservations are created when a huge page backed shared memory segment is -created (shmget(SHM_HUGETLB)) or a mapping is created via mmap(MAP_HUGETLB). -These operations result in a call to the routine hugetlb_reserve_pages():: - - int hugetlb_reserve_pages(struct inode *inode, - long from, long to, - struct vm_area_struct *vma, - vm_flags_t vm_flags) - -The first thing hugetlb_reserve_pages() does is check for the NORESERVE -flag was specified in either the shmget() or mmap() call. If NORESERVE -was specified, then this routine returns immediately as no reservation -are desired. - -The arguments 'from' and 'to' are huge page indices into the mapping or -underlying file. For shmget(), 'from' is always 0 and 'to' corresponds to -the length of the segment/mapping. For mmap(), the offset argument could -be used to specify the offset into the underlying file. In such a case -the 'from' and 'to' arguments have been adjusted by this offset. - -One of the big differences between PRIVATE and SHARED mappings is the way -in which reservations are represented in the reservation map. 
- -- For shared mappings, an entry in the reservation map indicates a reservation - exists or did exist for the corresponding page. As reservations are - consumed, the reservation map is not modified. -- For private mappings, the lack of an entry in the reservation map indicates - a reservation exists for the corresponding page. As reservations are - consumed, entries are added to the reservation map. Therefore, the - reservation map can also be used to determine which reservations have - been consumed. - -For private mappings, hugetlb_reserve_pages() creates the reservation map and -hangs it off the VMA structure. In addition, the HPAGE_RESV_OWNER flag is set -to indicate this VMA owns the reservations. - -The reservation map is consulted to determine how many huge page reservations -are needed for the current mapping/segment. For private mappings, this is -always the value (to - from). However, for shared mappings it is possible that some reservations may already exist within the range (to - from). See the -section :ref:`Reservation Map Modifications ` -for details on how this is accomplished. - -The mapping may be associated with a subpool. If so, the subpool is consulted -to ensure there is sufficient space for the mapping. It is possible that the -subpool has set aside reservations that can be used for the mapping. See the -section :ref:`Subpool Reservations ` for more details. - -After consulting the reservation map and subpool, the number of needed new -reservations is known. The routine hugetlb_acct_memory() is called to check -for and take the requested number of reservations. hugetlb_acct_memory() -calls into routines that potentially allocate and adjust surplus page counts. -However, within those routines the code is simply checking to ensure there -are enough free huge pages to accommodate the reservation. If there are, -the global reservation count resv_huge_pages is adjusted something like the -following:: - - if (resv_needed <= (resv_huge_pages - free_huge_pages)) - resv_huge_pages += resv_needed; - -Note that the global lock hugetlb_lock is held when checking and adjusting -these counters. - -If there were enough free huge pages and the global count resv_huge_pages -was adjusted, then the reservation map associated with the mapping is -modified to reflect the reservations. In the case of a shared mapping, a -file_region will exist that includes the range 'from' 'to'. For private -mappings, no modifications are made to the reservation map as lack of an -entry indicates a reservation exists. - -If hugetlb_reserve_pages() was successful, the global reservation count and -reservation map associated with the mapping will be modified as required to -ensure reservations exist for the range 'from' - 'to'. - -.. _consume_resv: - -Consuming Reservations/Allocating a Huge Page -============================================= - -Reservations are consumed when huge pages associated with the reservations -are allocated and instantiated in the corresponding mapping. The allocation -is performed within the routine alloc_huge_page():: - - struct page *alloc_huge_page(struct vm_area_struct *vma, - unsigned long addr, int avoid_reserve) - -alloc_huge_page is passed a VMA pointer and a virtual address, so it can -consult the reservation map to determine if a reservation exists. In addition, -alloc_huge_page takes the argument avoid_reserve which indicates reserves -should not be used even if it appears they have been set aside for the -specified address. 
The avoid_reserve argument is most often used in the case -of Copy on Write and Page Migration where additional copies of an existing -page are being allocated. - -The helper routine vma_needs_reservation() is called to determine if a -reservation exists for the address within the mapping(vma). See the section -:ref:`Reservation Map Helper Routines ` for detailed -information on what this routine does. -The value returned from vma_needs_reservation() is generally -0 or 1. 0 if a reservation exists for the address, 1 if no reservation exists. -If a reservation does not exist, and there is a subpool associated with the -mapping the subpool is consulted to determine if it contains reservations. -If the subpool contains reservations, one can be used for this allocation. -However, in every case the avoid_reserve argument overrides the use of -a reservation for the allocation. After determining whether a reservation -exists and can be used for the allocation, the routine dequeue_huge_page_vma() -is called. This routine takes two arguments related to reservations: - -- avoid_reserve, this is the same value/argument passed to alloc_huge_page() -- chg, even though this argument is of type long only the values 0 or 1 are - passed to dequeue_huge_page_vma. If the value is 0, it indicates a - reservation exists (see the section "Memory Policy and Reservations" for - possible issues). If the value is 1, it indicates a reservation does not - exist and the page must be taken from the global free pool if possible. - -The free lists associated with the memory policy of the VMA are searched for -a free page. If a page is found, the value free_huge_pages is decremented -when the page is removed from the free list. If there was a reservation -associated with the page, the following adjustments are made:: - - SetPagePrivate(page); /* Indicates allocating this page consumed - * a reservation, and if an error is - * encountered such that the page must be - * freed, the reservation will be restored. */ - resv_huge_pages--; /* Decrement the global reservation count */ - -Note, if no huge page can be found that satisfies the VMA's memory policy -an attempt will be made to allocate one using the buddy allocator. This -brings up the issue of surplus huge pages and overcommit which is beyond -the scope reservations. Even if a surplus page is allocated, the same -reservation based adjustments as above will be made: SetPagePrivate(page) and -resv_huge_pages--. - -After obtaining a new huge page, (page)->private is set to the value of -the subpool associated with the page if it exists. This will be used for -subpool accounting when the page is freed. - -The routine vma_commit_reservation() is then called to adjust the reserve -map based on the consumption of the reservation. In general, this involves -ensuring the page is represented within a file_region structure of the region -map. For shared mappings where the the reservation was present, an entry -in the reserve map already existed so no change is made. However, if there -was no reservation in a shared mapping or this was a private mapping a new -entry must be created. - -It is possible that the reserve map could have been changed between the call -to vma_needs_reservation() at the beginning of alloc_huge_page() and the -call to vma_commit_reservation() after the page was allocated. This would -be possible if hugetlb_reserve_pages was called for the same page in a shared -mapping. In such cases, the reservation count and subpool free page count -will be off by one. 
-
-
-Instantiate Huge Pages
-======================
-
-After huge page allocation, the page is typically added to the page tables
-of the allocating task. Before this, pages in a shared mapping are added
-to the page cache and pages in private mappings are added to an anonymous
-reverse mapping. In both cases, the PagePrivate flag is cleared.
-Therefore, when a huge page that has been instantiated is freed no
-adjustment is made to the global reservation count (resv_huge_pages).
-
-
-Freeing Huge Pages
-==================
-
-Huge page freeing is performed by the routine free_huge_page(). This
-routine is the destructor for hugetlbfs compound pages. As a result, it is
-only passed a pointer to the page struct. When a huge page is freed,
-reservation accounting may need to be performed. This would be the case if
-the page was associated with a subpool that contained reserves, or the
-page is being freed on an error path where a global reserve count must be
-restored.
-
-The page->private field points to any subpool associated with the page.
-If the PagePrivate flag is set, it indicates the global reserve count
-should be adjusted (see the section
-:ref:`Consuming Reservations/Allocating a Huge Page <consume_resv>`
-for information on how these are set).
-
-The routine first calls hugepage_subpool_put_pages() for the page. If this
-routine returns a value of 0 (rather than the value 1 that was passed), it
-indicates reserves are associated with the subpool, and this newly freed
-page must be used to keep the number of subpool reserves above the minimum
-size. Therefore, the global resv_huge_pages counter is incremented in this
-case.
-
-If the PagePrivate flag was set in the page, the global resv_huge_pages
-counter will always be incremented.
-
-.. _sub_pool_resv:
-
-Subpool Reservations
-====================
-
-There is a struct hstate associated with each huge page size. The hstate
-tracks all huge pages of the specified size. A subpool represents a subset
-of pages within a hstate that is associated with a mounted hugetlbfs
-filesystem.
-
-When a hugetlbfs filesystem is mounted a min_size option can be specified
-which indicates the minimum number of huge pages required by the
-filesystem. If this option is specified, the number of huge pages
-corresponding to min_size are reserved for use by the filesystem. This
-number is tracked in the min_hpages field of a struct hugepage_subpool. At
-mount time, hugetlb_acct_memory(min_hpages) is called to reserve the
-specified number of huge pages. If they can not be reserved, the mount
-fails.
-
-The routines hugepage_subpool_get/put_pages() are called when pages are
-obtained from or released back to a subpool. They perform all subpool
-accounting, and track any reservations associated with the subpool.
-hugepage_subpool_get/put_pages are passed the number of huge pages by
-which to adjust the subpool 'used page' count (down for get, up for put).
-Normally, they return the same value that was passed or an error if not
-enough pages exist in the subpool.
-
-However, if reserves are associated with the subpool a return value less
-than the passed value may be returned. This return value indicates the
-number of additional global pool adjustments which must be made. For
-example, suppose a subpool contains 3 reserved huge pages and someone asks
-for 5. The 3 reserved pages associated with the subpool can be used to
-satisfy part of the request. But, 2 pages must be obtained from the global
-pools. To relay this information to the caller, the value 2 is returned.
-The caller is then responsible for attempting to obtain the additional two
-pages from the global pools.
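-
-A hedged sketch of the caller's side of this contract, loosely modeled on
-the reservation path in mm/hugetlb.c (the local variable names are
-invented for illustration)::
-
-	/* Ask the subpool for 'pages' huge pages. */
-	long gbl_needed = hugepage_subpool_get_pages(spool, pages);
-
-	if (gbl_needed < 0)
-		return -ENOSPC;	/* subpool limit exceeded */
-
-	/*
-	 * Only the shortfall (2 in the example above) must be taken
-	 * from the global pool via hugetlb_acct_memory().
-	 */
-	if (hugetlb_acct_memory(h, gbl_needed) < 0) {
-		hugepage_subpool_put_pages(spool, pages);
-		return -ENOSPC;
-	}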
-
-
-COW and Reservations
-====================
-
-Since shared mappings all point to and use the same underlying pages, the
-biggest reservation concern for COW is private mappings. In this case,
-two tasks can be pointing at the same previously allocated page. One task
-attempts to write to the page, so a new page must be allocated so that
-each task points to its own page.
-
-When the page was originally allocated, the reservation for that page was
-consumed. When an attempt to allocate a new page is made as a result of
-COW, it is possible that no free huge pages are available and the
-allocation will fail.
-
-When the private mapping was originally created, the owner of the mapping
-was noted by setting the HPAGE_RESV_OWNER bit in the pointer to the
-reservation map of the owner. Since the owner created the mapping, the
-owner owns all the reservations associated with the mapping. Therefore,
-when a write fault occurs and there is no page available, different action
-is taken for the owner and non-owner of the reservation.
-
-In the case where the faulting task is not the owner, the fault will fail
-and the task will typically receive a SIGBUS.
-
-If the owner is the faulting task, we want it to succeed since it owned
-the original reservation. To accomplish this, the page is unmapped from
-the non-owning task. In this way, the only reference is from the owning
-task. In addition, the HPAGE_RESV_UNMAPPED bit is set in the reservation
-map pointer of the non-owning task. The non-owning task may receive a
-SIGBUS if it later faults on a non-present page. But, the original owner
-of the mapping/reservation will behave as expected.
-
-
-.. _resv_map_modifications:
-
-Reservation Map Modifications
-=============================
-
-The following low level routines are used to make modifications to a
-reservation map. Typically, these routines are not called directly.
-Rather, a reservation map helper routine is called which calls one of
-these low level routines. These low level routines are fairly well
-documented in the source code (mm/hugetlb.c). These routines are::
-
-	long region_chg(struct resv_map *resv, long f, long t);
-	long region_add(struct resv_map *resv, long f, long t);
-	void region_abort(struct resv_map *resv, long f, long t);
-	long region_count(struct resv_map *resv, long f, long t);
-
-Operations on the reservation map typically involve two steps, as shown
-in the sketch after this list:
-
-1) region_chg() is called to examine the reserve map and determine how
-   many pages in the specified range [f, t) are NOT currently represented.
-
-   The calling code performs global checks and allocations to determine if
-   there are enough huge pages for the operation to succeed.
-
-2)
-  a) If the operation can succeed, region_add() is called to actually
-     modify the reservation map for the same range [f, t) previously
-     passed to region_chg().
-  b) If the operation can not succeed, region_abort() is called for the
-     same range [f, t) to abort the operation.
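-
-A hedged sketch of the two step pattern (illustrative only; the real
-callers are hugetlb_reserve_pages() and the helper routines described
-below, and global_pool_can_cover() is a hypothetical stand-in for the
-global checks)::
-
-	/* Step 1: how many pages of [f, t) are not yet in the map? */
-	long chg = region_chg(resv, f, t);
-	if (chg < 0)
-		return chg;
-
-	if (global_pool_can_cover(chg)) {
-		/* Step 2a: commit; guaranteed to succeed after region_chg() */
-		region_add(resv, f, t);
-	} else {
-		/* Step 2b: free anything region_chg() pre-allocated */
-		region_abort(resv, f, t);
-		return -ENOMEM;
-	}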
-
-Note that this is a two step process where region_add() and region_abort()
-are guaranteed to succeed after a prior call to region_chg() for the same
-range. region_chg() is responsible for pre-allocating any data structures
-necessary to ensure the subsequent operations (specifically region_add())
-will succeed.
-
-As mentioned above, region_chg() determines the number of pages in the
-range which are NOT currently represented in the map. This number is
-returned to the caller. region_add() returns the number of pages in the
-range added to the map. In most cases, the return value of region_add()
-is the same as the return value of region_chg(). However, in the case of
-shared mappings it is possible for changes to the reservation map to be
-made between the calls to region_chg() and region_add(). In this case,
-the return value of region_add() will not match the return value of
-region_chg(). It is likely that in such cases global counts and subpool
-accounting will be incorrect and in need of adjustment. It is the
-responsibility of the caller to check for this condition and make the
-appropriate adjustments.
-
-The routine region_del() is called to remove regions from a reservation
-map. It is typically called in the following situations:
-
-- When a file in the hugetlbfs filesystem is being removed, the inode will
-  be released and the reservation map freed. Before freeing the
-  reservation map, all the individual file_region structures must be
-  freed. In this case region_del is passed the range [0, LONG_MAX).
-- When a hugetlbfs file is being truncated. In this case, all allocated
-  pages after the new file size must be freed. In addition, any
-  file_region entries in the reservation map past the new end of file
-  must be deleted. In this case, region_del is passed the range
-  [new_end_of_file, LONG_MAX).
-- When a hole is being punched in a hugetlbfs file. In this case, huge
-  pages are removed from the middle of the file one at a time. As the
-  pages are removed, region_del() is called to remove the corresponding
-  entry from the reservation map. In this case, region_del is passed the
-  range [page_idx, page_idx + 1).
-
-In every case, region_del() will return the number of pages removed from
-the reservation map. In VERY rare cases, region_del() can fail. This can
-only happen in the hole punch case where it has to split an existing
-file_region entry and can not allocate a new structure. In this error
-case, region_del() will return -ENOMEM. The problem here is that the
-reservation map will indicate that there is a reservation for the page.
-However, the subpool and global reservation counts will not reflect the
-reservation. To handle this situation, the routine
-hugetlb_fix_reserve_counts() is called to adjust the counters so that
-they correspond with the reservation map entry that could not be deleted.
-
-region_count() is called when unmapping a private huge page mapping. In
-private mappings, the lack of an entry in the reservation map indicates
-that a reservation exists. Therefore, by counting the number of entries
-in the reservation map we know how many reservations were consumed and
-how many are outstanding
-(outstanding = (end - start) - region_count(resv, start, end)).
-Since the mapping is going away, the subpool and global reservation
-counts are decremented by the number of outstanding reservations.
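-
-A hedged sketch of that unmap-time accounting, condensed from the VMA
-close path in mm/hugetlb.c (the local variable names are invented for
-illustration)::
-
-	/* Private mapping being torn down: count consumed reservations. */
-	long consumed = region_count(resv, start, end);
-	long outstanding = (end - start) - consumed;
-
-	if (outstanding) {
-		/* Return unconsumed reservations to the global pool... */
-		hugetlb_acct_memory(h, -outstanding);
-		/* ...and to the subpool, if one is associated. */
-		hugepage_subpool_put_pages(spool, outstanding);
-	}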
-
-.. _resv_map_helpers:
-
-Reservation Map Helper Routines
-===============================
-
-Several helper routines exist to query and modify the reservation maps.
-These routines are only concerned with reservations for a specific huge
-page, so they just pass in an address instead of a range. In addition,
-they pass in the associated VMA. From the VMA, the type of mapping
-(private or shared) and the location of the reservation map (inode or
-VMA) can be determined. These routines simply call the underlying
-routines described in the section "Reservation Map Modifications".
-However, they do take into account the 'opposite' meaning of reservation
-map entries for private and shared mappings and hide this detail from the
-caller::
-
-	long vma_needs_reservation(struct hstate *h,
-				   struct vm_area_struct *vma,
-				   unsigned long addr)
-
-This routine calls region_chg() for the specified page. If no reservation
-exists, 1 is returned. If a reservation exists, 0 is returned::
-
-	long vma_commit_reservation(struct hstate *h,
-				    struct vm_area_struct *vma,
-				    unsigned long addr)
-
-This calls region_add() for the specified page. As in the case of
-region_chg and region_add, this routine is to be called after a previous
-call to vma_needs_reservation. It will add a reservation entry for the
-page. It returns 1 if the reservation was added and 0 if not. The return
-value should be compared with the return value of the previous call to
-vma_needs_reservation. An unexpected difference indicates the reservation
-map was modified between calls::
-
-	void vma_end_reservation(struct hstate *h,
-				 struct vm_area_struct *vma,
-				 unsigned long addr)
-
-This calls region_abort() for the specified page. As in the case of
-region_chg and region_abort, this routine is to be called after a previous
-call to vma_needs_reservation. It will abort/end the in-progress
-reservation add operation::
-
-	long vma_add_reservation(struct hstate *h,
-				 struct vm_area_struct *vma,
-				 unsigned long addr)
-
-This is a special wrapper routine to help facilitate reservation cleanup
-on error paths. It is only called from the routine
-restore_reserve_on_error(). This routine is used in conjunction with
-vma_needs_reservation in an attempt to add a reservation to the
-reservation map. It takes into account the different reservation map
-semantics for private and shared mappings. Hence, region_add is called
-for shared mappings (as an entry present in the map indicates a
-reservation), and region_del is called for private mappings (as the
-absence of an entry in the map indicates a reservation). See the section
-"Reservation cleanup in error paths" for more information on what needs
-to be done on error paths.
-
-
-Reservation Cleanup in Error Paths
-==================================
-
-As mentioned in the section
-:ref:`Reservation Map Helper Routines <resv_map_helpers>`, reservation
-map modifications are performed in two steps. First vma_needs_reservation
-is called before a page is allocated. If the allocation is successful,
-then vma_commit_reservation is called. If not, vma_end_reservation is
-called. Global and subpool reservation counts are adjusted based on
-success or failure of the operation and all is well.
-
-Additionally, after a huge page is instantiated the PagePrivate flag is
-cleared so that accounting when the page is ultimately freed is correct.
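-
-A hedged sketch of that two step pattern on an allocation path
-(illustrative only; the helper get_huge_page() is a hypothetical stand-in
-for whatever allocation is being attempted)::
-
-	long chg = vma_needs_reservation(h, vma, addr);
-	struct page *page = get_huge_page();	/* hypothetical */
-
-	if (!page) {
-		/* Failure: end the in-progress reservation add. */
-		vma_end_reservation(h, vma, addr);
-		return -ENOMEM;
-	}
-	/* Success: commit, and compare with the earlier return value. */
-	if (vma_commit_reservation(h, vma, addr) != chg)
-		/* reserve map was modified between calls; fix up counts */;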
-
-However, there are several instances where errors are encountered after a
-huge page is allocated but before it is instantiated. In this case, the
-page allocation has consumed the reservation and made the appropriate
-subpool, reservation map and global count adjustments. If the page is
-freed at this time (before instantiation and clearing of PagePrivate),
-then free_huge_page will increment the global reservation count. However,
-the reservation map indicates the reservation was consumed. This
-resulting inconsistent state will cause the 'leak' of a reserved huge
-page. The global reserve count will be higher than it should be and
-prevent allocation of a pre-allocated page.
-
-The routine restore_reserve_on_error() attempts to handle this situation.
-It is fairly well documented. The intention of this routine is to restore
-the reservation map to the way it was before the page allocation. In this
-way, the state of the reservation map will correspond to the global
-reservation count after the page is freed.
-
-The routine restore_reserve_on_error itself may encounter errors while
-attempting to restore the reservation map entry. In this case, it will
-simply clear the PagePrivate flag of the page. In this way, the global
-reserve count will not be incremented when the page is freed. However,
-the reservation map will continue to look as though the reservation was
-consumed. A page can still be allocated for the address, but it will not
-use a reserved page as originally intended.
-
-There is some code (most notably userfaultfd) which can not call
-restore_reserve_on_error. In this case, it simply modifies the
-PagePrivate flag so that a reservation will not be leaked when the huge
-page is freed.
-
-
-Reservations and Memory Policy
-==============================
-Per-node huge page lists existed in struct hstate when git was first used
-to manage Linux code. The concept of reservations was added some time
-later. When reservations were added, no attempt was made to take memory
-policy into account. While cpusets are not exactly the same as memory
-policy, this comment in hugetlb_acct_memory sums up the interaction
-between reservations and cpusets/memory policy::
-
-	/*
-	 * When cpuset is configured, it breaks the strict hugetlb page
-	 * reservation as the accounting is done on a global variable. Such
-	 * reservation is completely rubbish in the presence of cpuset because
-	 * the reservation is not checked against page availability for the
-	 * current cpuset. Application can still potentially OOM'ed by kernel
-	 * with lack of free htlb page in cpuset that the task is in.
-	 * Attempt to enforce strict accounting with cpuset is almost
-	 * impossible (or too ugly) because cpuset is too fluid that
-	 * task or memory node can be dynamically moved between cpusets.
-	 *
-	 * The change of semantics for shared hugetlb mapping with cpuset is
-	 * undesirable. However, in order to preserve some of the semantics,
-	 * we fall back to check against current free page availability as
-	 * a best attempt and hopefully to minimize the impact of changing
-	 * semantics that cpuset has.
-	 */
-
-Huge page reservations were added to prevent unexpected page allocation
-failures (OOM) at page fault time. However, if an application makes use
-of cpusets or memory policy there is no guarantee that huge pages will be
-available on the required nodes. This is true even if there are a
-sufficient number of global reservations.
-
-
-Mike Kravetz, 7 April 2017
diff --git a/Documentation/vm/hugetlbpage.rst b/Documentation/vm/hugetlbpage.rst
new file mode 100644
index 000000000000..a5da14b05b4b
--- /dev/null
+++ b/Documentation/vm/hugetlbpage.rst
@@ -0,0 +1,386 @@
+.. _hugetlbpage:
+
+=============
+HugeTLB Pages
+=============
+
+Overview
+========
+
+The intent of this file is to give a brief summary of hugetlbpage support
+in the Linux kernel. This support is built on top of multiple page size
+support that is provided by most modern architectures. For example, x86
+CPUs normally support 4K and 2M (1G if architecturally supported) page
+sizes, ia64 architecture supports multiple page sizes 4K, 8K, 64K, 256K,
+1M, 4M, 16M, 256M and ppc64 supports 4K and 16M. A TLB is a cache of
+virtual-to-physical translations. Typically this is a very scarce
+resource on a processor. Operating systems try to make best use of the
+limited number of TLB resources. This optimization is more critical now
+as bigger and bigger physical memories (several GBs) are more readily
+available.
+
+Users can use the huge page support in the Linux kernel by either using
+the mmap system call or standard SYSV shared memory system calls (shmget,
+shmat).
+
+First the Linux kernel needs to be built with the CONFIG_HUGETLBFS
+(present under "File systems") and CONFIG_HUGETLB_PAGE (selected
+automatically when CONFIG_HUGETLBFS is selected) configuration
+options.
+
+The ``/proc/meminfo`` file provides information about the total number of
+persistent hugetlb pages in the kernel's huge page pool. It also displays
+the default huge page size and information about the number of free,
+reserved and surplus huge pages in the pool of huge pages of default
+size. The huge page size is needed for generating the proper alignment
+and size of the arguments to system calls that map huge page regions.
+
+The output of ``cat /proc/meminfo`` will include lines like::
+
+	HugePages_Total: uuu
+	HugePages_Free:  vvv
+	HugePages_Rsvd:  www
+	HugePages_Surp:  xxx
+	Hugepagesize:    yyy kB
+	Hugetlb:         zzz kB
+
+where:
+
+HugePages_Total
+	is the size of the pool of huge pages.
+HugePages_Free
+	is the number of huge pages in the pool that are not yet
+	allocated.
+HugePages_Rsvd
+	is short for "reserved," and is the number of huge pages for
+	which a commitment to allocate from the pool has been made,
+	but no allocation has yet been made. Reserved huge pages
+	guarantee that an application will be able to allocate a
+	huge page from the pool of huge pages at fault time.
+HugePages_Surp
+	is short for "surplus," and is the number of huge pages in
+	the pool above the value in ``/proc/sys/vm/nr_hugepages``. The
+	maximum number of surplus huge pages is controlled by
+	``/proc/sys/vm/nr_overcommit_hugepages``.
+Hugepagesize
+	is the default huge page size (in kB).
+Hugetlb
+	is the total amount of memory (in kB) consumed by huge
+	pages of all sizes.
+	If huge pages of different sizes are in use, this number
+	will exceed HugePages_Total \* Hugepagesize. To get more
+	detailed information, please, refer to
+	``/sys/kernel/mm/hugepages`` (described below).
+
+
+``/proc/filesystems`` should also show a filesystem of type "hugetlbfs"
+configured in the kernel.
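+
+These counters can also be read programmatically. A minimal userspace
+sketch (not from the kernel tree) that extracts the free count::
+
+	#include <stdio.h>
+
+	int main(void)
+	{
+		char line[128];
+		unsigned long free_pages;
+		FILE *f = fopen("/proc/meminfo", "r");
+
+		if (!f)
+			return 1;
+		while (fgets(line, sizeof(line), f))
+			if (sscanf(line, "HugePages_Free: %lu",
+				   &free_pages) == 1)
+				printf("free huge pages: %lu\n", free_pages);
+		fclose(f);
+		return 0;
+	}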
+
+``/proc/sys/vm/nr_hugepages`` indicates the current number of "persistent"
+huge pages in the kernel's huge page pool. "Persistent" huge pages will
+be returned to the huge page pool when freed by a task. A user with root
+privileges can dynamically allocate more or free some persistent huge
+pages by increasing or decreasing the value of ``nr_hugepages``.
+
+Pages that are used as huge pages are reserved inside the kernel and
+cannot be used for other purposes. Huge pages cannot be swapped out under
+memory pressure.
+
+Once a number of huge pages have been pre-allocated to the kernel huge
+page pool, a user with appropriate privilege can use either the mmap
+system call or shared memory system calls to use the huge pages. See the
+discussion of Using Huge Pages, below.
+
+The administrator can allocate persistent huge pages on the kernel boot
+command line by specifying the "hugepages=N" parameter, where 'N' = the
+number of huge pages requested. This is the most reliable method of
+allocating huge pages as memory has not yet become fragmented.
+
+Some platforms support multiple huge page sizes. To allocate huge pages
+of a specific size, one must precede the huge pages boot command
+parameters with a huge page size selection parameter
+"hugepagesz=<size>". <size> must be specified in bytes with optional
+scale suffix [kKmMgG]. The default huge page size may be selected with
+the "default_hugepagesz=<size>" boot parameter.
+
+When multiple huge page sizes are supported, ``/proc/sys/vm/nr_hugepages``
+indicates the current number of pre-allocated huge pages of the default
+size. Thus, one can use the following command to dynamically
+allocate/deallocate default sized persistent huge pages::
+
+	echo 20 > /proc/sys/vm/nr_hugepages
+
+This command will try to adjust the number of default sized huge pages in
+the huge page pool to 20, allocating or freeing huge pages, as required.
+
+On a NUMA platform, the kernel will attempt to distribute the huge page
+pool over all the set of allowed nodes specified by the NUMA memory
+policy of the task that modifies ``nr_hugepages``. The default for the
+allowed nodes--when the task has default memory policy--is all on-line
+nodes with memory. Allowed nodes with insufficient available, contiguous
+memory for a huge page will be silently skipped when allocating
+persistent huge pages. See the discussion below of the interaction of
+task memory policy, cpusets and per node attributes with the allocation
+and freeing of persistent huge pages.
+
+The success or failure of huge page allocation depends on the amount of
+physically contiguous memory that is present in the system at the time of
+the allocation attempt. If the kernel is unable to allocate huge pages
+from some nodes in a NUMA system, it will attempt to make up the
+difference by allocating extra pages on other nodes with sufficient
+available contiguous memory, if any.
+
+System administrators may want to put this command in one of the local rc
+init files. This will enable the kernel to allocate huge pages early in
+the boot process when the possibility of getting physical contiguous
+pages is still very high. Administrators can verify the number of huge
+pages actually allocated by checking the sysctl or meminfo. To check the
+per node distribution of huge pages in a NUMA system, use::
+
+	cat /sys/devices/system/node/node*/meminfo | fgrep Huge
+
+``/proc/sys/vm/nr_overcommit_hugepages`` specifies how large the pool of
+huge pages can grow, if more huge pages than ``/proc/sys/vm/nr_hugepages``
+are requested by applications.
+Writing any non-zero value into this file indicates that the hugetlb
+subsystem is allowed to try to obtain that number of "surplus" huge pages
+from the kernel's normal page pool, when the persistent huge page pool is
+exhausted. As these surplus huge pages become unused, they are freed back
+to the kernel's normal page pool.
+
+When increasing the huge page pool size via ``nr_hugepages``, any existing
+surplus pages will first be promoted to persistent huge pages. Then,
+additional huge pages will be allocated, if necessary and if possible, to
+fulfill the new persistent huge page pool size.
+
+The administrator may shrink the pool of persistent huge pages for
+the default huge page size by setting the ``nr_hugepages`` sysctl to a
+smaller value. The kernel will attempt to balance the freeing of huge
+pages across all nodes in the memory policy of the task modifying
+``nr_hugepages``. Any free huge pages on the selected nodes will be freed
+back to the kernel's normal page pool.
+
+Caveat: Shrinking the persistent huge page pool via ``nr_hugepages`` such
+that it becomes less than the number of huge pages in use will convert
+the balance of the in-use huge pages to surplus huge pages. This will
+occur even if the number of surplus pages would exceed the overcommit
+value. As long as this condition holds--that is, until
+``nr_hugepages+nr_overcommit_hugepages`` is increased sufficiently, or
+the surplus huge pages go out of use and are freed--no more surplus huge
+pages will be allowed to be allocated.
+
+With support for multiple huge page pools at run-time available, much of
+the huge page userspace interface in ``/proc/sys/vm`` has been duplicated
+in sysfs.
+The ``/proc`` interfaces discussed above have been retained for backwards
+compatibility. The root huge page control directory in sysfs is::
+
+	/sys/kernel/mm/hugepages
+
+For each huge page size supported by the running kernel, a subdirectory
+will exist, of the form::
+
+	hugepages-${size}kB
+
+Inside each of these directories, the same set of files will exist::
+
+	nr_hugepages
+	nr_hugepages_mempolicy
+	nr_overcommit_hugepages
+	free_hugepages
+	resv_hugepages
+	surplus_hugepages
+
+which function as described above for the default huge page-sized case.
+
+
+Interaction of Task Memory Policy with Huge Page Allocation/Freeing
+===================================================================
+
+Whether huge pages are allocated and freed via the ``/proc`` interface or
+the ``/sysfs`` interface using the ``nr_hugepages_mempolicy`` attribute,
+the NUMA nodes from which huge pages are allocated or freed are
+controlled by the NUMA memory policy of the task that modifies the
+``nr_hugepages_mempolicy`` sysctl or attribute. When the ``nr_hugepages``
+attribute is used, mempolicy is ignored.
+
+The recommended method to allocate or free huge pages to/from the kernel
+huge page pool, using the ``nr_hugepages`` example above, is::
+
+	numactl --interleave <node-list> echo 20 \
+		>/proc/sys/vm/nr_hugepages_mempolicy
+
+or, more succinctly::
+
+	numactl -m <node-list> echo 20 >/proc/sys/vm/nr_hugepages_mempolicy
+
+This will allocate or free ``abs(20 - nr_hugepages)`` to or from the
+nodes specified in <node-list>, depending on whether the number of
+persistent huge pages is initially less than or greater than 20,
+respectively. No huge pages will be allocated nor freed on any node not
+included in the specified <node-list>.
+
+When adjusting the persistent hugepage count via
+``nr_hugepages_mempolicy``, any memory policy mode--bind, preferred,
+local or interleave--may be used.
+The resulting effect on persistent huge page allocation is as follows:
+
+#. Regardless of mempolicy mode [see
+   Documentation/vm/numa_memory_policy.rst], persistent huge pages will
+   be distributed across the node or nodes specified in the mempolicy as
+   if "interleave" had been specified. However, if a node in the policy
+   does not contain sufficient contiguous memory for a huge page, the
+   allocation will not "fallback" to the nearest neighbor node with
+   sufficient contiguous memory. To do this would cause undesirable
+   imbalance in the distribution of the huge page pool, or possibly,
+   allocation of persistent huge pages on nodes not allowed by the
+   task's memory policy.
+
+#. One or more nodes may be specified with the bind or interleave policy.
+   If more than one node is specified with the preferred policy, only the
+   lowest numeric id will be used. Local policy will select the node where
+   the task is running at the time the nodes_allowed mask is constructed.
+   For local policy to be deterministic, the task must be bound to a cpu or
+   cpus in a single node. Otherwise, the task could be migrated to some
+   other node at any time after launch and the resulting node will be
+   indeterminate. Thus, local policy is not very useful for this purpose.
+   Any of the other mempolicy modes may be used to specify a single node.
+
+#. The nodes allowed mask will be derived from any non-default task
+   mempolicy, whether this policy was set explicitly by the task itself or
+   one of its ancestors, such as numactl. This means that if the task is
+   invoked from a shell with non-default policy, that policy will be used.
+   One can specify a node list of "all" with numactl --interleave or
+   --membind [-m] to achieve interleaving over all nodes in the system or
+   cpuset.
+
+#. Any task mempolicy specified--e.g., using numactl--will be constrained
+   by the resource limits of any cpuset in which the task runs. Thus,
+   there will be no way for a task with non-default policy running in a
+   cpuset with a subset of the system nodes to allocate huge pages
+   outside the cpuset without first moving to a cpuset that contains all
+   of the desired nodes.
+
+#. Boot-time huge page allocation attempts to distribute the requested
+   number of huge pages over all on-line nodes with memory.
+
+Per Node Hugepages Attributes
+=============================
+
+A subset of the contents of the root huge page control directory in
+sysfs, described above, will be replicated under the system device of
+each NUMA node with memory in::
+
+	/sys/devices/system/node/node[0-9]*/hugepages/
+
+Under this directory, the subdirectory for each supported huge page size
+contains the following attribute files::
+
+	nr_hugepages
+	free_hugepages
+	surplus_hugepages
+
+The ``free_`` and ``surplus_`` attribute files are read-only. They return
+the number of free and surplus [overcommitted] huge pages, respectively,
+on the parent node.
+
+The ``nr_hugepages`` attribute returns the total number of huge pages on
+the specified node. When this attribute is written, the number of
+persistent huge pages on the parent node will be adjusted to the
+specified value, if sufficient resources exist, regardless of the task's
+mempolicy or cpuset constraints.
+
+Note that the number of overcommit and reserve pages remain global
+quantities, as we don't know until fault time, when the faulting task's
+mempolicy is applied, from which node the huge page allocation will be
+attempted.
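+
+A minimal userspace sketch of adjusting a per node pool (not from the
+kernel tree; it assumes 2 MB default huge pages and that node0 exists)::
+
+	#include <fcntl.h>
+	#include <stdio.h>
+	#include <unistd.h>
+
+	#define NODE0_2M_POOL \
+		"/sys/devices/system/node/node0/hugepages/" \
+		"hugepages-2048kB/nr_hugepages"
+
+	int main(void)
+	{
+		/* Request 16 persistent 2 MB pages on node 0 (needs root). */
+		int fd = open(NODE0_2M_POOL, O_WRONLY);
+
+		if (fd < 0 || write(fd, "16", 2) != 2)
+			perror(NODE0_2M_POOL);
+		if (fd >= 0)
+			close(fd);
+		return 0;
+	}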
+
+
+Using Huge Pages
+================
+
+If the user applications are going to request huge pages using the mmap
+system call, then it is required that the system administrator mount a
+file system of type hugetlbfs::
+
+	mount -t hugetlbfs \
+		-o uid=<value>,gid=<value>,mode=<value>,pagesize=<value>,size=<value>,\
+		min_size=<value>,nr_inodes=<value> none /mnt/huge
+
+This command mounts a (pseudo) filesystem of type hugetlbfs on the
+directory ``/mnt/huge``. Any files created on ``/mnt/huge`` use huge
+pages.
+
+The ``uid`` and ``gid`` options set the owner and group of the root of
+the file system. By default the ``uid`` and ``gid`` of the current
+process are taken.
+
+The ``mode`` option sets the mode of the root of the file system to
+value & 01777. This value is given in octal. By default the value 0755
+is picked.
+
+If the platform supports multiple huge page sizes, the ``pagesize``
+option can be used to specify the huge page size and associated pool.
+``pagesize`` is specified in bytes. If ``pagesize`` is not specified the
+platform's default huge page size and associated pool will be used.
+
+The ``size`` option sets the maximum value of memory (huge pages) allowed
+for that filesystem (``/mnt/huge``). The ``size`` option can be specified
+in bytes, or as a percentage of the specified huge page pool
+(``nr_hugepages``). The size is rounded down to HPAGE_SIZE boundary.
+
+The ``min_size`` option sets the minimum value of memory (huge pages)
+allowed for the filesystem. ``min_size`` can be specified in the same way
+as ``size``, either bytes or a percentage of the huge page pool.
+At mount time, the number of huge pages specified by ``min_size`` are
+reserved for use by the filesystem.
+If there are not enough free huge pages available, the mount will fail.
+As huge pages are allocated to the filesystem and freed, the reserve
+count is adjusted so that the sum of allocated and reserved huge pages
+is always at least ``min_size``.
+
+The option ``nr_inodes`` sets the maximum number of inodes that
+``/mnt/huge`` can use.
+
+If the ``size``, ``min_size`` or ``nr_inodes`` option is not provided on
+the command line then no limits are set.
+
+For ``pagesize``, ``size``, ``min_size`` and ``nr_inodes`` options, you
+can use [G|g]/[M|m]/[K|k] to represent giga/mega/kilo.
+For example, size=2K has the same meaning as size=2048.
+
+While read system calls are supported on files that reside on hugetlb
+file systems, write system calls are not.
+
+Regular chown, chgrp, and chmod commands (with the right permissions)
+could be used to change the file attributes on hugetlbfs.
+
+Also, it is important to note that no such mount command is required if
+applications are going to use only shmat/shmget system calls or mmap with
+MAP_HUGETLB. For an example of how to use mmap with MAP_HUGETLB see
+:ref:`map_hugetlb <map_hugetlb>` below.
+
+Users who wish to use hugetlb memory via shared memory segment should be
+members of a supplementary group, and the system admin needs to configure
+that gid into ``/proc/sys/vm/hugetlb_shm_group``. It is possible for the
+same or different applications to use any combination of mmaps and shm*
+calls, though the mount of the filesystem will be required for using mmap
+calls without MAP_HUGETLB.
+
+Syscalls that operate on memory backed by hugetlb pages only have their
+lengths aligned to the native page size of the processor; they will
+normally fail with errno set to EINVAL or exclude hugetlb pages that
+extend beyond the length if not hugepage aligned. For example, munmap(2)
+will fail if memory is backed by a hugetlb page and the length is
+smaller than the hugepage size.
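+
+A minimal sketch of the MAP_HUGETLB route (no hugetlbfs mount needed; it
+assumes a pre-allocated pool of default sized huge pages and is a
+condensed variant of the map_hugetlb selftest referenced below)::
+
+	#include <stdio.h>
+	#include <stdlib.h>
+	#include <string.h>
+	#include <sys/mman.h>
+
+	#ifndef MAP_HUGETLB
+	#define MAP_HUGETLB 0x40000
+	#endif
+
+	#define LENGTH (2UL * 1024 * 1024)	/* one 2 MB default page */
+
+	int main(void)
+	{
+		void *addr = mmap(NULL, LENGTH, PROT_READ | PROT_WRITE,
+				  MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
+				  -1, 0);
+
+		if (addr == MAP_FAILED) {
+			perror("mmap");	/* likely an empty huge page pool */
+			exit(1);
+		}
+		memset(addr, 0, LENGTH);	/* fault in the huge page */
+		munmap(addr, LENGTH);
+		return 0;
+	}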
+
+
+Examples
+========
+
+.. _map_hugetlb:
+
+``map_hugetlb``
+	see tools/testing/selftests/vm/map_hugetlb.c
+
+``hugepage-shm``
+	see tools/testing/selftests/vm/hugepage-shm.c
+
+``hugepage-mmap``
+	see tools/testing/selftests/vm/hugepage-mmap.c
+
+The `libhugetlbfs`_ library provides a wide range of userspace tools
+to help with huge page usability, environment setup, and control.
+
+.. _libhugetlbfs: https://github.com/libhugetlbfs/libhugetlbfs
+
+Kernel development regression testing
+=====================================
+
+The most complete set of hugetlb tests is in the libhugetlbfs repository.
+If you modify any hugetlb related code, use the libhugetlbfs test suite
+to check for regressions. In addition, if you add any new hugetlb
+functionality, please add appropriate tests to libhugetlbfs.
diff --git a/Documentation/vm/hwpoison.rst b/Documentation/vm/hwpoison.rst
new file mode 100644
index 000000000000..070aa1e716b7
--- /dev/null
+++ b/Documentation/vm/hwpoison.rst
@@ -0,0 +1,186 @@
+.. _hwpoison:
+
+========
+hwpoison
+========
+
+What is hwpoison?
+=================
+
+Upcoming Intel CPUs have support for recovering from some memory errors
+(``MCA recovery``). This requires the OS to declare a page "poisoned",
+kill the processes associated with it and avoid using it in the future.
+
+This patchkit implements the necessary infrastructure in the VM.
+
+To quote the overview comment:
+
+ * High level machine check handler. Handles pages reported by the
+ * hardware as being corrupted usually due to a 2bit ECC memory or cache
+ * failure.
+ *
+ * This focusses on pages detected as corrupted in the background.
+ * When the current CPU tries to consume corruption the currently
+ * running process can just be killed directly instead. This implies
+ * that if the error cannot be handled for some reason it's safe to
+ * just ignore it because no corruption has been consumed yet. Instead
+ * when that happens another machine check will happen.
+ *
+ * Handles page cache pages in various states. The tricky part
+ * here is that we can access any page asynchronous to other VM
+ * users, because memory failures could happen anytime and anywhere,
+ * possibly violating some of their assumptions. This is why this code
+ * has to be extremely careful. Generally it tries to use normal locking
+ * rules, as in get the standard locks, even if that means the
+ * error handling takes potentially a long time.
+ *
+ * Some of the operations here are somewhat inefficient and have non
+ * linear algorithmic complexity, because the data structures have not
+ * been optimized for this case. This is in particular the case
+ * for the mapping from a vma to a process. Since this case is expected
+ * to be rare we hope we can get away with this.
+
+The code consists of the high level handler in mm/memory-failure.c,
+a new page poison bit and various checks in the VM to handle poisoned
+pages.
+
+The main target right now is KVM guests, but it works for all kinds
+of applications. KVM support requires a recent qemu-kvm release.
+
+For the KVM use there was a need for a new signal type so that
+KVM can inject the machine check into the guest with the proper
+address. This in theory allows other applications to handle
+memory failures too. The expectation is that nearly all applications
+won't do that, but some very specialized ones might.
+
+Failure recovery modes
+======================
+
+There are two (actually three) modes memory failure recovery can be in:
+
+vm.memory_failure_recovery sysctl set to zero:
+	All memory failures cause a panic. Do not attempt recovery.
+	(on x86 this can be also affected by the tolerant level of the
+	MCE subsystem)
+
+early kill
+	(can be controlled globally and per process)
+	Send SIGBUS to the application as soon as the error is detected.
+	This allows applications that can process memory errors in a
+	gentle way (e.g. drop the affected object) to do so.
+	This is the mode used by KVM qemu.
+
+late kill
+	Send SIGBUS when the application runs into the corrupted page.
+	This is best for memory error unaware applications and is the
+	default.
+	Note some pages are always handled as late kill.
+
+User control
+============
+
+vm.memory_failure_recovery
+	See sysctl.txt
+
+vm.memory_failure_early_kill
+	Enable early kill mode globally
+
+PR_MCE_KILL
+	Set early/late kill mode/revert to system default
+
+	arg1: PR_MCE_KILL_CLEAR:
+		Revert to system default
+	arg1: PR_MCE_KILL_SET:
+		arg2 defines thread specific mode
+
+		PR_MCE_KILL_EARLY:
+			Early kill
+		PR_MCE_KILL_LATE:
+			Late kill
+		PR_MCE_KILL_DEFAULT
+			Use system global default
+
+	Note that if you want to have a dedicated thread which handles
+	the SIGBUS(BUS_MCEERR_AO) on behalf of the process, you should
+	call prctl(PR_MCE_KILL_EARLY) on the designated thread. Otherwise,
+	the SIGBUS is sent to the main thread.
+
+PR_MCE_KILL_GET
+	return current mode
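+
+A minimal userspace sketch of opting a thread into early kill (not from
+the kernel tree; the constant fallbacks mirror their values in
+include/uapi/linux/prctl.h in case the libc headers lack them)::
+
+	#include <stdio.h>
+	#include <sys/prctl.h>
+
+	#ifndef PR_MCE_KILL
+	#define PR_MCE_KILL		33
+	#define PR_MCE_KILL_SET		1
+	#define PR_MCE_KILL_EARLY	1
+	#define PR_MCE_KILL_GET		34
+	#endif
+
+	int main(void)
+	{
+		/* Ask for SIGBUS(BUS_MCEERR_AO) as soon as poison is found. */
+		if (prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY,
+			  0, 0))
+			perror("PR_MCE_KILL");
+
+		printf("mode: %d\n", prctl(PR_MCE_KILL_GET, 0, 0, 0, 0));
+		return 0;
+	}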
+
+  Note these injection interfaces are not stable and might change between
+  kernel versions.
+
+  corrupt-filter-dev-major, corrupt-filter-dev-minor
+	Only handle memory failures to pages associated with the file
+	system defined by block device major/minor. -1U is the
+	wildcard value. This should only be used for testing with
+	artificial injection.
+
+  corrupt-filter-memcg
+	Limit injection to pages owned by a memory cgroup, specified
+	by the inode number of the memcg.
+
+	Example::
+
+		mkdir /sys/fs/cgroup/mem/hwpoison
+
+		usemem -m 100 -s 1000 &
+		echo `jobs -p` > /sys/fs/cgroup/mem/hwpoison/tasks
+
+		memcg_ino=$(ls -id /sys/fs/cgroup/mem/hwpoison | cut -f1 -d' ')
+		echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg
+
+		page-types -p `pidof init`   --hwpoison  # shall do nothing
+		page-types -p `pidof usemem` --hwpoison  # poison its pages
+
+  corrupt-filter-flags-mask, corrupt-filter-flags-value
+	When specified, only poison pages if ((page_flags & mask) ==
+	value). This allows stress testing of many kinds of
+	pages. The page_flags are the same as in /proc/kpageflags. The
+	flag bits are defined in include/linux/kernel-page-flags.h and
+	documented in Documentation/vm/pagemap.rst
+
+* Architecture specific MCE injector
+
+  x86 has mce-inject, mce-test
+
+  Some portable hwpoison test programs in mce-test, see below.
+
+References
+==========
+
+http://halobates.de/mce-lc09-2.pdf
+	Overview presentation from LinuxCon 09
+
+git://git.kernel.org/pub/scm/utils/cpu/mce/mce-test.git
+	Test suite (hwpoison specific portable tests in tsrc)
+
+git://git.kernel.org/pub/scm/utils/cpu/mce/mce-inject.git
+	x86 specific injector
+
+
+Limitations
+===========
+- Not all page types are supported and never will be. Most kernel internal
+  objects cannot be recovered, only LRU pages for now.
+- Right now hugepage support is missing.
+
+---
+Andi Kleen, Oct 2009
diff --git a/Documentation/vm/hwpoison.txt b/Documentation/vm/hwpoison.txt
deleted file mode 100644
index b1a8c241d6c2..000000000000
--- a/Documentation/vm/hwpoison.txt
+++ /dev/null
@@ -1,186 +0,0 @@
-.. hwpoison:
-
-========
-hwpoison
-========
-
-What is hwpoison?
-=================
-
-Upcoming Intel CPUs have support for recovering from some memory errors
-(``MCA recovery``). This requires the OS to declare a page "poisoned",
-kill the processes associated with it and avoid using it in the future.
-
-This patchkit implements the necessary infrastructure in the VM.
-
-To quote the overview comment:
-
- * High level machine check handler. Handles pages reported by the
- * hardware as being corrupted usually due to a 2bit ECC memory or cache
- * failure.
- *
- * This focusses on pages detected as corrupted in the background.
- * When the current CPU tries to consume corruption the currently
- * running process can just be killed directly instead. This implies
- * that if the error cannot be handled for some reason it's safe to
- * just ignore it because no corruption has been consumed yet. Instead
- * when that happens another machine check will happen.
- *
- * Handles page cache pages in various states. The tricky part
- * here is that we can access any page asynchronous to other VM
- * users, because memory failures could happen anytime and anywhere,
- * possibly violating some of their assumptions. This is why this code
- * has to be extremely careful. Generally it tries to use normal locking
- * rules, as in get the standard locks, even if that means the
- * error handling takes potentially a long time. 
- * - * Some of the operations here are somewhat inefficient and have non - * linear algorithmic complexity, because the data structures have not - * been optimized for this case. This is in particular the case - * for the mapping from a vma to a process. Since this case is expected - * to be rare we hope we can get away with this. - -The code consists of a the high level handler in mm/memory-failure.c, -a new page poison bit and various checks in the VM to handle poisoned -pages. - -The main target right now is KVM guests, but it works for all kinds -of applications. KVM support requires a recent qemu-kvm release. - -For the KVM use there was need for a new signal type so that -KVM can inject the machine check into the guest with the proper -address. This in theory allows other applications to handle -memory failures too. The expection is that near all applications -won't do that, but some very specialized ones might. - -Failure recovery modes -====================== - -There are two (actually three) modes memory failure recovery can be in: - -vm.memory_failure_recovery sysctl set to zero: - All memory failures cause a panic. Do not attempt recovery. - (on x86 this can be also affected by the tolerant level of the - MCE subsystem) - -early kill - (can be controlled globally and per process) - Send SIGBUS to the application as soon as the error is detected - This allows applications who can process memory errors in a gentle - way (e.g. drop affected object) - This is the mode used by KVM qemu. - -late kill - Send SIGBUS when the application runs into the corrupted page. - This is best for memory error unaware applications and default - Note some pages are always handled as late kill. - -User control -============ - -vm.memory_failure_recovery - See sysctl.txt - -vm.memory_failure_early_kill - Enable early kill mode globally - -PR_MCE_KILL - Set early/late kill mode/revert to system default - - arg1: PR_MCE_KILL_CLEAR: - Revert to system default - arg1: PR_MCE_KILL_SET: - arg2 defines thread specific mode - - PR_MCE_KILL_EARLY: - Early kill - PR_MCE_KILL_LATE: - Late kill - PR_MCE_KILL_DEFAULT - Use system global default - - Note that if you want to have a dedicated thread which handles - the SIGBUS(BUS_MCEERR_AO) on behalf of the process, you should - call prctl(PR_MCE_KILL_EARLY) on the designated thread. Otherwise, - the SIGBUS is sent to the main thread. - -PR_MCE_KILL_GET - return current mode - -Testing -======= - -* madvise(MADV_HWPOISON, ....) (as root) - Poison a page in the - process for testing - -* hwpoison-inject module through debugfs ``/sys/kernel/debug/hwpoison/`` - - corrupt-pfn - Inject hwpoison fault at PFN echoed into this file. This does - some early filtering to avoid corrupted unintended pages in test suites. - - unpoison-pfn - Software-unpoison page at PFN echoed into this file. This way - a page can be reused again. This only works for Linux - injected failures, not for real memory failures. - - Note these injection interfaces are not stable and might change between - kernel versions - - corrupt-filter-dev-major, corrupt-filter-dev-minor - Only handle memory failures to pages associated with the file - system defined by block device major/minor. -1U is the - wildcard value. This should be only used for testing with - artificial injection. - - corrupt-filter-memcg - Limit injection to pages owned by memgroup. Specified by inode - number of the memcg. 
-
-	Example::
-
-		mkdir /sys/fs/cgroup/mem/hwpoison
-
-		usemem -m 100 -s 1000 &
-		echo `jobs -p` > /sys/fs/cgroup/mem/hwpoison/tasks
-
-		memcg_ino=$(ls -id /sys/fs/cgroup/mem/hwpoison | cut -f1 -d' ')
-		echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg
-
-		page-types -p `pidof init`   --hwpoison  # shall do nothing
-		page-types -p `pidof usemem` --hwpoison  # poison its pages
-
-  corrupt-filter-flags-mask, corrupt-filter-flags-value
-	When specified, only poison pages if ((page_flags & mask) ==
-	value). This allows stress testing of many kinds of
-	pages. The page_flags are the same as in /proc/kpageflags. The
-	flag bits are defined in include/linux/kernel-page-flags.h and
-	documented in Documentation/vm/pagemap.txt
-
-* Architecture specific MCE injector
-
-  x86 has mce-inject, mce-test
-
-  Some portable hwpoison test programs in mce-test, see below.
-
-References
-==========
-
-http://halobates.de/mce-lc09-2.pdf
-	Overview presentation from LinuxCon 09
-
-git://git.kernel.org/pub/scm/utils/cpu/mce/mce-test.git
-	Test suite (hwpoison specific portable tests in tsrc)
-
-git://git.kernel.org/pub/scm/utils/cpu/mce/mce-inject.git
-	x86 specific injector
-
-
-Limitations
-===========
-- Not all page types are supported and never will. Most kernel internal
-  objects cannot be recovered, only LRU pages for now.
-- Right now hugepage support is missing.
-
----
-Andi Kleen, Oct 2009
diff --git a/Documentation/vm/idle_page_tracking.rst b/Documentation/vm/idle_page_tracking.rst
new file mode 100644
index 000000000000..d1c4609a5220
--- /dev/null
+++ b/Documentation/vm/idle_page_tracking.rst
@@ -0,0 +1,115 @@
+.. _idle_page_tracking:
+
+==================
+Idle Page Tracking
+==================
+
+Motivation
+==========
+
+The idle page tracking feature allows tracking which memory pages are being
+accessed by a workload and which are idle. This information can be useful for
+estimating the workload's working set size, which, in turn, can be taken into
+account when configuring the workload parameters, setting memory cgroup limits,
+or deciding where to place the workload within a compute cluster.
+
+It is enabled by CONFIG_IDLE_PAGE_TRACKING=y.
+
+.. _user_api:
+
+User API
+========
+
+The idle page tracking API is located at ``/sys/kernel/mm/page_idle``.
+Currently, it consists of a single read-write file,
+``/sys/kernel/mm/page_idle/bitmap``.
+
+The file implements a bitmap where each bit corresponds to a memory page. The
+bitmap is represented by an array of 8-byte integers, and the page at PFN #i is
+mapped to bit #i%64 of array element #i/64; the byte order is native. When a
+bit is set, the corresponding page is idle.
+
+A page is considered idle if it has not been accessed since it was marked idle
+(for more details on what "accessed" actually means see the :ref:`Implementation
+Details <impl_details>` section).
+To mark a page idle one has to set the bit corresponding to
+the page by writing to the file. A value written to the file is OR-ed with the
+current bitmap value.
+
+Only accesses to user memory pages are tracked. These are pages mapped to a
+process address space, page cache and buffer pages, and swap cache pages. For
+other page types (e.g. SLAB pages) an attempt to mark a page idle is silently
+ignored, and hence such pages are never reported idle.
+
+For huge pages the idle flag is set only on the head page, so one has to read
+``/proc/kpageflags`` in order to correctly count idle huge pages.
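+
+As an illustration of the bit layout just described, here is a minimal
+user-space sketch (not part of the kernel sources; it assumes root and
+takes the PFN as its argument) that marks a single page idle. Note the
+8-byte access rules spelled out in the next paragraph::
+
+	/* mark_idle.c - set the idle bit for one PFN. */
+	#include <fcntl.h>
+	#include <stdint.h>
+	#include <stdio.h>
+	#include <stdlib.h>
+	#include <unistd.h>
+
+	int main(int argc, char **argv)
+	{
+		if (argc != 2) {
+			fprintf(stderr, "usage: %s <pfn>\n", argv[0]);
+			return 1;
+		}
+		uint64_t pfn = strtoull(argv[1], NULL, 0);
+		/* PFN #i maps to bit #i%64 of 8-byte word #i/64. */
+		uint64_t word = 1ULL << (pfn % 64);
+		off_t offset = (pfn / 64) * 8;
+
+		int fd = open("/sys/kernel/mm/page_idle/bitmap", O_WRONLY);
+		if (fd < 0) {
+			perror("open");
+			return 1;
+		}
+		/* The written value is OR-ed into the bitmap, so bits for
+		 * other pages in the same word are left untouched. */
+		if (pwrite(fd, &word, sizeof(word), offset) != sizeof(word))
+			perror("pwrite");
+		close(fd);
+		return 0;
+	}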
+
+Reading from or writing to ``/sys/kernel/mm/page_idle/bitmap`` will return
+-EINVAL if you are not starting the read/write on an 8-byte boundary, or
+if the size of the read/write is not a multiple of 8 bytes. Writing to
+this file beyond max PFN will return -ENXIO.
+
+That said, in order to estimate the number of pages that are not used by a
+workload one should:
+
+ 1. Mark all the workload's pages as idle by setting corresponding bits in
+    ``/sys/kernel/mm/page_idle/bitmap``. The pages can be found by reading
+    ``/proc/pid/pagemap`` if the workload is represented by a process, or by
+    filtering out alien pages using ``/proc/kpagecgroup`` in case the workload
+    is placed in a memory cgroup.
+
+ 2. Wait until the workload accesses its working set.
+
+ 3. Read ``/sys/kernel/mm/page_idle/bitmap`` and count the number of bits set.
+    If one wants to ignore certain types of pages, e.g. mlocked pages since they
+    are not reclaimable, he or she can filter them out using
+    ``/proc/kpageflags``. (A rough sketch of this step appears at the end of
+    this file.)
+
+See Documentation/vm/pagemap.rst for more information about
+``/proc/pid/pagemap``, ``/proc/kpageflags``, and ``/proc/kpagecgroup``.
+
+.. _impl_details:
+
+Implementation Details
+======================
+
+The kernel internally keeps track of accesses to user memory pages in order to
+reclaim unreferenced pages first on memory shortage conditions. A page is
+considered referenced if it has been recently accessed via a process address
+space, in which case one or more PTEs it is mapped to will have the Accessed
+bit set, or if it has been marked accessed explicitly by the kernel (see
+mark_page_accessed()). The latter happens when:
+
+ - a userspace process reads or writes a page using a system call (e.g. read(2)
+   or write(2))
+
+ - a page that is used for storing filesystem buffers is read or written,
+   because a process needs filesystem metadata stored in it (e.g. lists a
+   directory tree)
+
+ - a page is accessed by a device driver using get_user_pages()
+
+When a dirty page is written to swap or disk as a result of memory reclaim or
+exceeding the dirty memory limit, it is not marked referenced.
+
+The idle memory tracking feature adds a new page flag, the Idle flag. This flag
+is set manually, by writing to ``/sys/kernel/mm/page_idle/bitmap`` (see the
+:ref:`User API <user_api>`
+section), and cleared automatically whenever a page is referenced as defined
+above.
+
+When a page is marked idle, the Accessed bit must be cleared in all PTEs it is
+mapped to, otherwise we will not be able to detect accesses to the page coming
+from a process address space. To avoid interference with the reclaimer, which,
+as noted above, uses the Accessed bit to promote actively referenced pages, one
+more page flag is introduced, the Young flag. When the PTE Accessed bit is
+cleared as a result of setting or updating a page's Idle flag, the Young flag
+is set on the page. The reclaimer treats the Young flag as an extra PTE
+Accessed bit and therefore will consider such a page as referenced.
+
+Since the idle memory tracking feature is based on the memory reclaimer logic,
+it only works with pages that are on an LRU list; other pages are silently
+ignored. That means it will ignore a user memory page if it is isolated, but
+since there are usually not many of them, it should not affect the overall
+result noticeably. In order not to stall scanning of the idle page bitmap,
+locked pages may be skipped too.
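+
+To make step 3 of the procedure in the :ref:`User API <user_api>` section
+concrete, here is a rough sketch (again not part of the kernel sources; the
+fixed PFN range is a stand-in for the set of pages one would really obtain
+from ``/proc/pid/pagemap``)::
+
+	/* count_idle.c - count idle pages in a PFN range. */
+	#include <fcntl.h>
+	#include <stdint.h>
+	#include <stdio.h>
+	#include <unistd.h>
+
+	int main(void)
+	{
+		/* Hypothetical, word-aligned PFN range. */
+		const uint64_t start_pfn = 0x100000, end_pfn = 0x101000;
+		uint64_t buf, idle = 0;
+
+		int fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDONLY);
+		if (fd < 0) {
+			perror("open");
+			return 1;
+		}
+		/* Reads must be 8-byte sized and aligned; each word
+		 * covers 64 PFNs. */
+		for (uint64_t pfn = start_pfn; pfn < end_pfn; pfn += 64) {
+			if (pread(fd, &buf, sizeof(buf), (pfn / 64) * 8)
+			    != sizeof(buf))
+				break;
+			idle += __builtin_popcountll(buf);
+		}
+		printf("%llu idle pages\n", (unsigned long long)idle);
+		close(fd);
+		return 0;
+	}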
diff --git a/Documentation/vm/idle_page_tracking.txt b/Documentation/vm/idle_page_tracking.txt
deleted file mode 100644
index 9cbe6f8d7a99..000000000000
--- a/Documentation/vm/idle_page_tracking.txt
+++ /dev/null
@@ -1,115 +0,0 @@
-.. _idle_page_tracking:
-
-==================
-Idle Page Tracking
-==================
-
-Motivation
-==========
-
-The idle page tracking feature allows to track which memory pages are being
-accessed by a workload and which are idle. This information can be useful for
-estimating the workload's working set size, which, in turn, can be taken into
-account when configuring the workload parameters, setting memory cgroup limits,
-or deciding where to place the workload within a compute cluster.
-
-It is enabled by CONFIG_IDLE_PAGE_TRACKING=y.
-
-.. _user_api:
-
-User API
-========
-
-The idle page tracking API is located at ``/sys/kernel/mm/page_idle``.
-Currently, it consists of the only read-write file,
-``/sys/kernel/mm/page_idle/bitmap``.
-
-The file implements a bitmap where each bit corresponds to a memory page. The
-bitmap is represented by an array of 8-byte integers, and the page at PFN #i is
-mapped to bit #i%64 of array element #i/64, byte order is native. When a bit is
-set, the corresponding page is idle.
-
-A page is considered idle if it has not been accessed since it was marked idle
-(for more details on what "accessed" actually means see the :ref:`Implementation
-Details <impl_details>` section).
-To mark a page idle one has to set the bit corresponding to
-the page by writing to the file. A value written to the file is OR-ed with the
-current bitmap value.
-
-Only accesses to user memory pages are tracked. These are pages mapped to a
-process address space, page cache and buffer pages, swap cache pages. For other
-page types (e.g. SLAB pages) an attempt to mark a page idle is silently ignored,
-and hence such pages are never reported idle.
-
-For huge pages the idle flag is set only on the head page, so one has to read
-``/proc/kpageflags`` in order to correctly count idle huge pages.
-
-Reading from or writing to ``/sys/kernel/mm/page_idle/bitmap`` will return
--EINVAL if you are not starting the read/write on an 8-byte boundary, or
-if the size of the read/write is not a multiple of 8 bytes. Writing to
-this file beyond max PFN will return -ENXIO.
-
-That said, in order to estimate the amount of pages that are not used by a
-workload one should:
-
- 1. Mark all the workload's pages as idle by setting corresponding bits in
-    ``/sys/kernel/mm/page_idle/bitmap``. The pages can be found by reading
-    ``/proc/pid/pagemap`` if the workload is represented by a process, or by
-    filtering out alien pages using ``/proc/kpagecgroup`` in case the workload
-    is placed in a memory cgroup.
-
- 2. Wait until the workload accesses its working set.
-
- 3. Read ``/sys/kernel/mm/page_idle/bitmap`` and count the number of bits set.
-    If one wants to ignore certain types of pages, e.g. mlocked pages since they
-    are not reclaimable, he or she can filter them out using
-    ``/proc/kpageflags``.
-
-See Documentation/vm/pagemap.txt for more information about
-``/proc/pid/pagemap``, ``/proc/kpageflags``, and ``/proc/kpagecgroup``.
-
-.. _impl_details:
-
-Implementation Details
-======================
-
-The kernel internally keeps track of accesses to user memory pages in order to
-reclaim unreferenced pages first on memory shortage conditions. 
A page is
-considered referenced if it has been recently accessed via a process address
-space, in which case one or more PTEs it is mapped to will have the Accessed bit
-set, or marked accessed explicitly by the kernel (see mark_page_accessed()). The
-latter happens when:
-
- - a userspace process reads or writes a page using a system call (e.g. read(2)
-   or write(2))
-
- - a page that is used for storing filesystem buffers is read or written,
-   because a process needs filesystem metadata stored in it (e.g. lists a
-   directory tree)
-
- - a page is accessed by a device driver using get_user_pages()
-
-When a dirty page is written to swap or disk as a result of memory reclaim or
-exceeding the dirty memory limit, it is not marked referenced.
-
-The idle memory tracking feature adds a new page flag, the Idle flag. This flag
-is set manually, by writing to ``/sys/kernel/mm/page_idle/bitmap`` (see the
-:ref:`User API <user_api>`
-section), and cleared automatically whenever a page is referenced as defined
-above.
-
-When a page is marked idle, the Accessed bit must be cleared in all PTEs it is
-mapped to, otherwise we will not be able to detect accesses to the page coming
-from a process address space. To avoid interference with the reclaimer, which,
-as noted above, uses the Accessed bit to promote actively referenced pages, one
-more page flag is introduced, the Young flag. When the PTE Accessed bit is
-cleared as a result of setting or updating a page's Idle flag, the Young flag
-is set on the page. The reclaimer treats the Young flag as an extra PTE
-Accessed bit and therefore will consider such a page as referenced.
-
-Since the idle memory tracking feature is based on the memory reclaimer logic,
-it only works with pages that are on an LRU list, other pages are silently
-ignored. That means it will ignore a user memory page if it is isolated, but
-since there are usually not many of them, it should not affect the overall
-result noticeably. In order not to stall scanning of the idle page bitmap,
-locked pages may be skipped too.
diff --git a/Documentation/vm/ksm.rst b/Documentation/vm/ksm.rst
new file mode 100644
index 000000000000..87e7eef5ea9c
--- /dev/null
+++ b/Documentation/vm/ksm.rst
@@ -0,0 +1,183 @@
+.. _ksm:
+
+=======================
+Kernel Samepage Merging
+=======================
+
+KSM is a memory-saving de-duplication feature, enabled by CONFIG_KSM=y,
+added to the Linux kernel in 2.6.32. See ``mm/ksm.c`` for its implementation,
+and http://lwn.net/Articles/306704/ and http://lwn.net/Articles/330589/
+
+The KSM daemon ksmd periodically scans those areas of user memory which
+have been registered with it, looking for pages of identical content which
+can be replaced by a single write-protected page (which is automatically
+copied if a process later wants to update its content).
+
+KSM was originally developed for use with KVM (where it was known as
+Kernel Shared Memory), to fit more virtual machines into physical memory,
+by sharing the data common between them. But it can be useful to any
+application which generates many instances of the same data.
+
+KSM only merges anonymous (private) pages, never pagecache (file) pages.
+KSM's merged pages were originally locked into kernel memory, but can now
+be swapped out just like other user pages (but sharing is broken when they
+are swapped back in: ksmd must rediscover their identity and merge again).
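+
+A minimal sketch of the registration interface described in the following
+paragraphs (the mapping size is arbitrary, and a real application would
+advise only areas it expects to contain duplicate data)::
+
+	/* ksm_hint.c - register an anonymous mapping with KSM. */
+	#include <stdio.h>
+	#include <string.h>
+	#include <sys/mman.h>
+
+	int main(void)
+	{
+		size_t len = 64 * 1024 * 1024; /* arbitrary demo size */
+		void *addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
+				  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+		if (addr == MAP_FAILED) {
+			perror("mmap");
+			return 1;
+		}
+		/* Fill with identical pages so ksmd has something to merge. */
+		memset(addr, 0x5a, len);
+
+		/* Ask KSM to consider this range; this fails with EINVAL
+		 * if the kernel was built without CONFIG_KSM. */
+		if (madvise(addr, len, MADV_MERGEABLE))
+			perror("madvise(MADV_MERGEABLE)");
+
+		/* ... run the workload; MADV_UNMERGEABLE undoes the hint. */
+		return 0;
+	}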
+
+KSM only operates on those areas of address space which an application
+has advised to be likely candidates for merging, by using the madvise(2)
+system call: int madvise(addr, length, MADV_MERGEABLE).
+
+The app may call int madvise(addr, length, MADV_UNMERGEABLE) to cancel
+that advice and restore unshared pages: whereupon KSM unmerges whatever
+it merged in that range. Note: this unmerging call may suddenly require
+more memory than is available - possibly failing with EAGAIN, but more
+probably arousing the Out-Of-Memory killer.
+
+If KSM is not configured into the running kernel, madvise MADV_MERGEABLE
+and MADV_UNMERGEABLE simply fail with EINVAL. If the running kernel was
+built with CONFIG_KSM=y, those calls will normally succeed: even if
+the KSM daemon is not currently running, MADV_MERGEABLE still registers
+the range for whenever the KSM daemon is started; even if the range
+cannot contain any pages which KSM could actually merge; even if
+MADV_UNMERGEABLE is applied to a range which was never MADV_MERGEABLE.
+
+If a region of memory must be split into at least one new MADV_MERGEABLE
+or MADV_UNMERGEABLE region, the madvise may return ENOMEM if the process
+will exceed vm.max_map_count (see Documentation/sysctl/vm.txt).
+
+Like other madvise calls, they are intended for use on mapped areas of
+the user address space: they will report ENOMEM if the specified range
+includes unmapped gaps (though working on the intervening mapped areas),
+and might fail with EAGAIN if there is not enough memory for internal
+structures.
+
+Applications should be considerate in their use of MADV_MERGEABLE,
+restricting its use to areas likely to benefit. KSM's scans may use a lot
+of processing power: some installations will disable KSM for that reason.
+
+The KSM daemon is controlled by sysfs files in ``/sys/kernel/mm/ksm/``,
+readable by all but writable only by root:
+
+pages_to_scan
+	how many present pages to scan before ksmd goes to sleep,
+	e.g. ``echo 100 > /sys/kernel/mm/ksm/pages_to_scan``. Default: 100
+	(chosen for demonstration purposes)
+
+sleep_millisecs
+	how many milliseconds ksmd should sleep before the next scan,
+	e.g. ``echo 20 > /sys/kernel/mm/ksm/sleep_millisecs``. Default: 20
+	(chosen for demonstration purposes)
+
+merge_across_nodes
+	specifies if pages from different NUMA nodes can be merged.
+	When set to 0, ksm merges only pages which physically reside
+	in the memory area of the same NUMA node. That brings lower
+	latency to accesses of shared pages. Systems with more nodes, at
+	significant NUMA distances, are likely to benefit from the
+	lower latency of setting 0. Smaller systems, which need to
+	minimize memory usage, are likely to benefit from the greater
+	sharing of setting 1 (default). You may wish to compare how
+	your system performs under each setting, before deciding on
+	which to use. The merge_across_nodes setting can be changed only
+	when there are no ksm shared pages in the system: set run to 2
+	to unmerge pages first, then back to 1 after changing
+	merge_across_nodes, to remerge according to the new setting.
+	Default: 1 (merging across nodes as in earlier releases)
+
+run
+	set 0 to stop ksmd from running but keep merged pages,
+	set 1 to run ksmd e.g. ``echo 1 > /sys/kernel/mm/ksm/run``,
+	set 2 to stop ksmd and unmerge all pages currently merged, but
+	leave mergeable areas registered for next run. Default: 0 (must
+	be changed to 1 to activate KSM, except if CONFIG_SYSFS is
+	disabled)
+
+use_zero_pages
+	specifies whether empty pages (i.e. 
allocated pages that only
+	contain zeroes) should be treated specially. When set to 1,
+	empty pages are merged with the kernel zero page(s) instead of
+	with each other as it would happen normally. This can improve
+	the performance on architectures with coloured zero pages,
+	depending on the workload. Care should be taken when enabling
+	this setting, as it can potentially degrade the performance of
+	KSM for some workloads, for example if the checksums of pages
+	that are candidates for merging match the checksum of an empty
+	page. This setting can be changed at any time; it is only
+	effective for pages merged after the change. Default: 0
+	(normal KSM behaviour as in earlier releases)
+
+max_page_sharing
+	Maximum sharing allowed for each KSM page. This enforces a
+	deduplication limit to keep the virtual memory rmap lists from
+	growing too large. The minimum value is 2 as a newly created KSM
+	page will have at least two sharers. The rmap walk has O(N)
+	complexity where N is the number of rmap_items (i.e. virtual
+	mappings) that are sharing the page, which is in turn capped
+	by max_page_sharing. So this effectively spreads the linear
+	O(N) computational complexity from rmap walk context over
+	different KSM pages. The ksmd walk over the stable_node
+	"chains" is also O(N), but N is the number of stable_node
+	"dups", not the number of rmap_items, so it does not have a
+	significant impact on ksmd performance. In practice the best
+	stable_node "dup" candidate will be kept and found at the head
+	of the "dups" list. The higher this value the faster KSM will
+	merge the memory (because there will be fewer stable_node dups
+	queued into the stable_node chain->hlist to check for pruning)
+	and the higher the deduplication factor will be, but the
+	slower the worst case rmap walk could be for any given KSM
+	page. Slowing down the rmap_walk means there will be higher
+	latency for certain virtual memory operations happening during
+	swapping, compaction, NUMA balancing and page migration, in
+	turn decreasing responsiveness for the caller of those virtual
+	memory operations. The scheduler latency of other tasks not
+	involved with the VM operations doing the rmap walk is not
+	affected by this parameter as the rmap walks are always
+	schedule friendly themselves.
+
+stable_node_chains_prune_millisecs
+	How frequently to walk the whole list of stable_node "dups"
+	linked in the stable_node "chains" in order to prune stale
+	stable_nodes. Smaller millisecs values will free up the KSM
+	metadata with lower latency, but they will make ksmd use more
+	CPU during the scan. This only applies to the stable_node
+	chains so it is a no-op if no KSM page has hit the
+	max_page_sharing limit yet (there would be no stable_node
+	chains in that case).
+
+The effectiveness of KSM and MADV_MERGEABLE is shown in ``/sys/kernel/mm/ksm/``:
+
+pages_shared
+	how many shared pages are being used
+pages_sharing
+	how many more sites are sharing them i.e. 
how much saved
+pages_unshared
+	how many pages are unique but repeatedly checked for merging
+pages_volatile
+	how many pages are changing too fast to be placed in a tree
+full_scans
+	how many times all mergeable areas have been scanned
+stable_node_chains
+	number of stable node chains allocated, this is effectively
+	the number of KSM pages that hit the max_page_sharing limit
+stable_node_dups
+	number of stable node dups queued into the stable_node chains
+
+A high ratio of pages_sharing to pages_shared indicates good sharing, but
+a high ratio of pages_unshared to pages_sharing indicates wasted effort.
+pages_volatile embraces several different kinds of activity, but a high
+proportion there would also indicate poor use of madvise MADV_MERGEABLE.
+
+The maximum possible pages_sharing/pages_shared ratio is limited by the
+max_page_sharing tunable. To increase the ratio max_page_sharing must
+be increased accordingly.
+
+The stable_node_dups/stable_node_chains ratio is also affected by the
+max_page_sharing tunable, and a high ratio may indicate fragmentation
+in the stable_node dups, which could be solved by introducing
+fragmentation algorithms in ksmd which would refile rmap_items from
+one stable_node dup to another stable_node dup, in order to free up
+stable_node "dups" with few rmap_items in them, but that may increase
+the ksmd CPU usage and possibly slow down the read-only computations on
+the KSM pages of the applications.
+
+Izik Eidus,
+Hugh Dickins, 17 Nov 2009
diff --git a/Documentation/vm/ksm.txt b/Documentation/vm/ksm.txt
deleted file mode 100644
index 87e7eef5ea9c..000000000000
--- a/Documentation/vm/ksm.txt
+++ /dev/null
@@ -1,183 +0,0 @@
-.. _ksm:
-
-=======================
-Kernel Samepage Merging
-=======================
-
-KSM is a memory-saving de-duplication feature, enabled by CONFIG_KSM=y,
-added to the Linux kernel in 2.6.32. See ``mm/ksm.c`` for its implementation,
-and http://lwn.net/Articles/306704/ and http://lwn.net/Articles/330589/
-
-The KSM daemon ksmd periodically scans those areas of user memory which
-have been registered with it, looking for pages of identical content which
-can be replaced by a single write-protected page (which is automatically
-copied if a process later wants to update its content).
-
-KSM was originally developed for use with KVM (where it was known as
-Kernel Shared Memory), to fit more virtual machines into physical memory,
-by sharing the data common between them. But it can be useful to any
-application which generates many instances of the same data.
-
-KSM only merges anonymous (private) pages, never pagecache (file) pages.
-KSM's merged pages were originally locked into kernel memory, but can now
-be swapped out just like other user pages (but sharing is broken when they
-are swapped back in: ksmd must rediscover their identity and merge again).
-
-KSM only operates on those areas of address space which an application
-has advised to be likely candidates for merging, by using the madvise(2)
-system call: int madvise(addr, length, MADV_MERGEABLE).
-
-The app may call int madvise(addr, length, MADV_UNMERGEABLE) to cancel
-that advice and restore unshared pages: whereupon KSM unmerges whatever
-it merged in that range. Note: this unmerging call may suddenly require
-more memory than is available - possibly failing with EAGAIN, but more
-probably arousing the Out-Of-Memory killer.
-
-If KSM is not configured into the running kernel, madvise MADV_MERGEABLE
-and MADV_UNMERGEABLE simply fail with EINVAL. 
If the running kernel was -built with CONFIG_KSM=y, those calls will normally succeed: even if the -the KSM daemon is not currently running, MADV_MERGEABLE still registers -the range for whenever the KSM daemon is started; even if the range -cannot contain any pages which KSM could actually merge; even if -MADV_UNMERGEABLE is applied to a range which was never MADV_MERGEABLE. - -If a region of memory must be split into at least one new MADV_MERGEABLE -or MADV_UNMERGEABLE region, the madvise may return ENOMEM if the process -will exceed vm.max_map_count (see Documentation/sysctl/vm.txt). - -Like other madvise calls, they are intended for use on mapped areas of -the user address space: they will report ENOMEM if the specified range -includes unmapped gaps (though working on the intervening mapped areas), -and might fail with EAGAIN if not enough memory for internal structures. - -Applications should be considerate in their use of MADV_MERGEABLE, -restricting its use to areas likely to benefit. KSM's scans may use a lot -of processing power: some installations will disable KSM for that reason. - -The KSM daemon is controlled by sysfs files in ``/sys/kernel/mm/ksm/``, -readable by all but writable only by root: - -pages_to_scan - how many present pages to scan before ksmd goes to sleep - e.g. ``echo 100 > /sys/kernel/mm/ksm/pages_to_scan`` Default: 100 - (chosen for demonstration purposes) - -sleep_millisecs - how many milliseconds ksmd should sleep before next scan - e.g. ``echo 20 > /sys/kernel/mm/ksm/sleep_millisecs`` Default: 20 - (chosen for demonstration purposes) - -merge_across_nodes - specifies if pages from different numa nodes can be merged. - When set to 0, ksm merges only pages which physically reside - in the memory area of same NUMA node. That brings lower - latency to access of shared pages. Systems with more nodes, at - significant NUMA distances, are likely to benefit from the - lower latency of setting 0. Smaller systems, which need to - minimize memory usage, are likely to benefit from the greater - sharing of setting 1 (default). You may wish to compare how - your system performs under each setting, before deciding on - which to use. merge_across_nodes setting can be changed only - when there are no ksm shared pages in system: set run 2 to - unmerge pages first, then to 1 after changing - merge_across_nodes, to remerge according to the new setting. - Default: 1 (merging across nodes as in earlier releases) - -run - set 0 to stop ksmd from running but keep merged pages, - set 1 to run ksmd e.g. ``echo 1 > /sys/kernel/mm/ksm/run``, - set 2 to stop ksmd and unmerge all pages currently merged, but - leave mergeable areas registered for next run Default: 0 (must - be changed to 1 to activate KSM, except if CONFIG_SYSFS is - disabled) - -use_zero_pages - specifies whether empty pages (i.e. allocated pages that only - contain zeroes) should be treated specially. When set to 1, - empty pages are merged with the kernel zero page(s) instead of - with each other as it would happen normally. This can improve - the performance on architectures with coloured zero pages, - depending on the workload. Care should be taken when enabling - this setting, as it can potentially degrade the performance of - KSM for some workloads, for example if the checksums of pages - candidate for merging match the checksum of an empty - page. This setting can be changed at any time, it is only - effective for pages merged after the change. 
Default: 0 - (normal KSM behaviour as in earlier releases) - -max_page_sharing - Maximum sharing allowed for each KSM page. This enforces a - deduplication limit to avoid the virtual memory rmap lists to - grow too large. The minimum value is 2 as a newly created KSM - page will have at least two sharers. The rmap walk has O(N) - complexity where N is the number of rmap_items (i.e. virtual - mappings) that are sharing the page, which is in turn capped - by max_page_sharing. So this effectively spread the the linear - O(N) computational complexity from rmap walk context over - different KSM pages. The ksmd walk over the stable_node - "chains" is also O(N), but N is the number of stable_node - "dups", not the number of rmap_items, so it has not a - significant impact on ksmd performance. In practice the best - stable_node "dup" candidate will be kept and found at the head - of the "dups" list. The higher this value the faster KSM will - merge the memory (because there will be fewer stable_node dups - queued into the stable_node chain->hlist to check for pruning) - and the higher the deduplication factor will be, but the - slowest the worst case rmap walk could be for any given KSM - page. Slowing down the rmap_walk means there will be higher - latency for certain virtual memory operations happening during - swapping, compaction, NUMA balancing and page migration, in - turn decreasing responsiveness for the caller of those virtual - memory operations. The scheduler latency of other tasks not - involved with the VM operations doing the rmap walk is not - affected by this parameter as the rmap walks are always - schedule friendly themselves. - -stable_node_chains_prune_millisecs - How frequently to walk the whole list of stable_node "dups" - linked in the stable_node "chains" in order to prune stale - stable_nodes. Smaller milllisecs values will free up the KSM - metadata with lower latency, but they will make ksmd use more - CPU during the scan. This only applies to the stable_node - chains so it's a noop if not a single KSM page hit the - max_page_sharing yet (there would be no stable_node chains in - such case). - -The effectiveness of KSM and MADV_MERGEABLE is shown in ``/sys/kernel/mm/ksm/``: - -pages_shared - how many shared pages are being used -pages_sharing - how many more sites are sharing them i.e. how much saved -pages_unshared - how many pages unique but repeatedly checked for merging -pages_volatile - how many pages changing too fast to be placed in a tree -full_scans - how many times all mergeable areas have been scanned -stable_node_chains - number of stable node chains allocated, this is effectively - the number of KSM pages that hit the max_page_sharing limit -stable_node_dups - number of stable node dups queued into the stable_node chains - -A high ratio of pages_sharing to pages_shared indicates good sharing, but -a high ratio of pages_unshared to pages_sharing indicates wasted effort. -pages_volatile embraces several different kinds of activity, but a high -proportion there would also indicate poor use of madvise MADV_MERGEABLE. - -The maximum possible page_sharing/page_shared ratio is limited by the -max_page_sharing tunable. To increase the ratio max_page_sharing must -be increased accordingly. 
-
-The stable_node_dups/stable_node_chains ratio is also affected by the
-max_page_sharing tunable, and an high ratio may indicate fragmentation
-in the stable_node dups, which could be solved by introducing
-fragmentation algorithms in ksmd which would refile rmap_items from
-one stable_node dup to another stable_node dup, in order to freeup
-stable_node "dups" with few rmap_items in them, but that may increase
-the ksmd CPU usage and possibly slowdown the readonly computations on
-the KSM pages of the applications.
-
-Izik Eidus,
-Hugh Dickins, 17 Nov 2009
diff --git a/Documentation/vm/mmu_notifier.rst b/Documentation/vm/mmu_notifier.rst
new file mode 100644
index 000000000000..47baa1cf28c5
--- /dev/null
+++ b/Documentation/vm/mmu_notifier.rst
@@ -0,0 +1,99 @@
+.. _mmu_notifier:
+
+When do you need to notify inside page table lock?
+===================================================
+
+When clearing a pte/pmd we are given a choice to notify the event under the
+page table lock (the notify version of \*_clear_flush calls
+mmu_notifier_invalidate_range). But that notification is not necessary in
+all cases.
+
+For secondary TLBs (non-CPU TLBs), such as IOMMU TLBs or device TLBs (where
+the device uses something like ATS/PASID to get the IOMMU to walk the CPU
+page table to access a process virtual address space), there are only 2
+cases when you need to notify those secondary TLBs while holding the page
+table lock when clearing a pte/pmd:
+
+  A) the page backing the address is freed before
+     mmu_notifier_invalidate_range_end()
+  B) a page table entry is updated to point to a new page (COW, write fault
+     on zero page, __replace_page(), ...)
+
+Case A is obvious: you do not want to take the risk of the device writing to
+a page that might now be used by some completely different task.
+
+Case B is more subtle. For correctness it requires the following sequence to
+happen:
+
+  - take page table lock
+  - clear page table entry and notify ([pmd/pte]p_huge_clear_flush_notify())
+  - set page table entry to point to new page
+
+If clearing the page table entry is not followed by a notify before setting
+the new pte/pmd value then you can break the memory model (e.g. C11 or
+C++11) for the device.
+
+Consider the following scenario (the device uses a feature similar to
+ATS/PASID):
+
+Take two addresses addrA and addrB such that \|addrA - addrB\| >= PAGE_SIZE;
+we assume they are write protected for COW (the other cases of B apply too).
+
+::
+
+ [Time N] --------------------------------------------------------------------
+ CPU-thread-0  {try to write to addrA}
+ CPU-thread-1  {try to write to addrB}
+ CPU-thread-2  {}
+ CPU-thread-3  {}
+ DEV-thread-0  {read addrA and populate device TLB}
+ DEV-thread-2  {read addrB and populate device TLB}
+ [Time N+1] ------------------------------------------------------------------
+ CPU-thread-0  {COW_step0: {mmu_notifier_invalidate_range_start(addrA)}}
+ CPU-thread-1  {COW_step0: {mmu_notifier_invalidate_range_start(addrB)}}
+ CPU-thread-2  {}
+ CPU-thread-3  {}
+ DEV-thread-0  {}
+ DEV-thread-2  {}
+ [Time N+2] ------------------------------------------------------------------
+ CPU-thread-0  {COW_step1: {update page table to point to new page for addrA}}
+ CPU-thread-1  {COW_step1: {update page table to point to new page for addrB}}
+ CPU-thread-2  {}
+ CPU-thread-3  {}
+ DEV-thread-0  {}
+ DEV-thread-2  {}
+ [Time N+3] ------------------------------------------------------------------
+ CPU-thread-0  {preempted}
+ CPU-thread-1  {preempted}
+ CPU-thread-2  {write to addrA which is a write to new page}
+ CPU-thread-3  {}
+ DEV-thread-0  {}
+ DEV-thread-2  {}
+ [Time N+4] ------------------------------------------------------------------
+ CPU-thread-0  {preempted}
+ CPU-thread-1  {preempted}
+ CPU-thread-2  {}
+ CPU-thread-3  {write to addrB which is a write to new page}
+ DEV-thread-0  {}
+ DEV-thread-2  {}
+ [Time N+5] ------------------------------------------------------------------
+ CPU-thread-0  {preempted}
+ CPU-thread-1  {COW_step3: {mmu_notifier_invalidate_range_end(addrB)}}
+ CPU-thread-2  {}
+ CPU-thread-3  {}
+ DEV-thread-0  {}
+ DEV-thread-2  {}
+ [Time N+6] ------------------------------------------------------------------
+ CPU-thread-0  {preempted}
+ CPU-thread-1  {}
+ CPU-thread-2  {}
+ CPU-thread-3  {}
+ DEV-thread-0  {read addrA from old page}
+ DEV-thread-2  {read addrB from new page}
+
+So here, because at time N+2 the cleared page table entry was not paired with
+a notification to invalidate the secondary TLB, the device sees the new value
+for addrB before seeing the new value for addrA. This breaks total memory
+ordering for the device.
+
+When changing a pte to write protect or to point to a new write protected page
+with the same content (KSM) it is fine to delay the
+mmu_notifier_invalidate_range call to mmu_notifier_invalidate_range_end()
+outside the page table lock. This is true even if the thread doing the page
+table update is preempted right after releasing the page table lock but
+before calling mmu_notifier_invalidate_range_end().
diff --git a/Documentation/vm/mmu_notifier.txt b/Documentation/vm/mmu_notifier.txt
deleted file mode 100644
index 47baa1cf28c5..000000000000
--- a/Documentation/vm/mmu_notifier.txt
+++ /dev/null
@@ -1,99 +0,0 @@
-.. _mmu_notifier:
-
-When do you need to notify inside page table lock ?
-===================================================
-
-When clearing a pte/pmd we are given a choice to notify the event through
-(notify version of \*_clear_flush call mmu_notifier_invalidate_range) under
-the page table lock. But that notification is not necessary in all cases.
-
-For secondary TLB (non CPU TLB) like IOMMU TLB or device TLB (when device use
-thing like ATS/PASID to get the IOMMU to walk the CPU page table to access a
-process virtual address space). 
There is only 2 cases when you need to notify -those secondary TLB while holding page table lock when clearing a pte/pmd: - - A) page backing address is free before mmu_notifier_invalidate_range_end() - B) a page table entry is updated to point to a new page (COW, write fault - on zero page, __replace_page(), ...) - -Case A is obvious you do not want to take the risk for the device to write to -a page that might now be used by some completely different task. - -Case B is more subtle. For correctness it requires the following sequence to -happen: - - - take page table lock - - clear page table entry and notify ([pmd/pte]p_huge_clear_flush_notify()) - - set page table entry to point to new page - -If clearing the page table entry is not followed by a notify before setting -the new pte/pmd value then you can break memory model like C11 or C++11 for -the device. - -Consider the following scenario (device use a feature similar to ATS/PASID): - -Two address addrA and addrB such that \|addrA - addrB\| >= PAGE_SIZE we assume -they are write protected for COW (other case of B apply too). - -:: - - [Time N] -------------------------------------------------------------------- - CPU-thread-0 {try to write to addrA} - CPU-thread-1 {try to write to addrB} - CPU-thread-2 {} - CPU-thread-3 {} - DEV-thread-0 {read addrA and populate device TLB} - DEV-thread-2 {read addrB and populate device TLB} - [Time N+1] ------------------------------------------------------------------ - CPU-thread-0 {COW_step0: {mmu_notifier_invalidate_range_start(addrA)}} - CPU-thread-1 {COW_step0: {mmu_notifier_invalidate_range_start(addrB)}} - CPU-thread-2 {} - CPU-thread-3 {} - DEV-thread-0 {} - DEV-thread-2 {} - [Time N+2] ------------------------------------------------------------------ - CPU-thread-0 {COW_step1: {update page table to point to new page for addrA}} - CPU-thread-1 {COW_step1: {update page table to point to new page for addrB}} - CPU-thread-2 {} - CPU-thread-3 {} - DEV-thread-0 {} - DEV-thread-2 {} - [Time N+3] ------------------------------------------------------------------ - CPU-thread-0 {preempted} - CPU-thread-1 {preempted} - CPU-thread-2 {write to addrA which is a write to new page} - CPU-thread-3 {} - DEV-thread-0 {} - DEV-thread-2 {} - [Time N+3] ------------------------------------------------------------------ - CPU-thread-0 {preempted} - CPU-thread-1 {preempted} - CPU-thread-2 {} - CPU-thread-3 {write to addrB which is a write to new page} - DEV-thread-0 {} - DEV-thread-2 {} - [Time N+4] ------------------------------------------------------------------ - CPU-thread-0 {preempted} - CPU-thread-1 {COW_step3: {mmu_notifier_invalidate_range_end(addrB)}} - CPU-thread-2 {} - CPU-thread-3 {} - DEV-thread-0 {} - DEV-thread-2 {} - [Time N+5] ------------------------------------------------------------------ - CPU-thread-0 {preempted} - CPU-thread-1 {} - CPU-thread-2 {} - CPU-thread-3 {} - DEV-thread-0 {read addrA from old page} - DEV-thread-2 {read addrB from new page} - -So here because at time N+2 the clear page table entry was not pair with a -notification to invalidate the secondary TLB, the device see the new value for -addrB before seing the new value for addrA. This break total memory ordering -for the device. - -When changing a pte to write protect or to point to a new write protected page -with same content (KSM) it is fine to delay the mmu_notifier_invalidate_range -call to mmu_notifier_invalidate_range_end() outside the page table lock. 
This -is true even if the thread doing the page table update is preempted right after -releasing page table lock but before call mmu_notifier_invalidate_range_end(). diff --git a/Documentation/vm/numa b/Documentation/vm/numa deleted file mode 100644 index c81e7c56f0f9..000000000000 --- a/Documentation/vm/numa +++ /dev/null @@ -1,150 +0,0 @@ -.. _numa: - -Started Nov 1999 by Kanoj Sarcar - -============= -What is NUMA? -============= - -This question can be answered from a couple of perspectives: the -hardware view and the Linux software view. - -From the hardware perspective, a NUMA system is a computer platform that -comprises multiple components or assemblies each of which may contain 0 -or more CPUs, local memory, and/or IO buses. For brevity and to -disambiguate the hardware view of these physical components/assemblies -from the software abstraction thereof, we'll call the components/assemblies -'cells' in this document. - -Each of the 'cells' may be viewed as an SMP [symmetric multi-processor] subset -of the system--although some components necessary for a stand-alone SMP system -may not be populated on any given cell. The cells of the NUMA system are -connected together with some sort of system interconnect--e.g., a crossbar or -point-to-point link are common types of NUMA system interconnects. Both of -these types of interconnects can be aggregated to create NUMA platforms with -cells at multiple distances from other cells. - -For Linux, the NUMA platforms of interest are primarily what is known as Cache -Coherent NUMA or ccNUMA systems. With ccNUMA systems, all memory is visible -to and accessible from any CPU attached to any cell and cache coherency -is handled in hardware by the processor caches and/or the system interconnect. - -Memory access time and effective memory bandwidth varies depending on how far -away the cell containing the CPU or IO bus making the memory access is from the -cell containing the target memory. For example, access to memory by CPUs -attached to the same cell will experience faster access times and higher -bandwidths than accesses to memory on other, remote cells. NUMA platforms -can have cells at multiple remote distances from any given cell. - -Platform vendors don't build NUMA systems just to make software developers' -lives interesting. Rather, this architecture is a means to provide scalable -memory bandwidth. However, to achieve scalable memory bandwidth, system and -application software must arrange for a large majority of the memory references -[cache misses] to be to "local" memory--memory on the same cell, if any--or -to the closest cell with memory. - -This leads to the Linux software view of a NUMA system: - -Linux divides the system's hardware resources into multiple software -abstractions called "nodes". Linux maps the nodes onto the physical cells -of the hardware platform, abstracting away some of the details for some -architectures. As with physical cells, software nodes may contain 0 or more -CPUs, memory and/or IO buses. And, again, memory accesses to memory on -"closer" nodes--nodes that map to closer cells--will generally experience -faster access times and higher effective bandwidth than accesses to more -remote cells. - -For some architectures, such as x86, Linux will "hide" any node representing a -physical cell that has no memory attached, and reassign any CPUs attached to -that cell to a node representing a cell that does have memory. 
Thus, on -these architectures, one cannot assume that all CPUs that Linux associates with -a given node will see the same local memory access times and bandwidth. - -In addition, for some architectures, again x86 is an example, Linux supports -the emulation of additional nodes. For NUMA emulation, linux will carve up -the existing nodes--or the system memory for non-NUMA platforms--into multiple -nodes. Each emulated node will manage a fraction of the underlying cells' -physical memory. NUMA emluation is useful for testing NUMA kernel and -application features on non-NUMA platforms, and as a sort of memory resource -management mechanism when used together with cpusets. -[see Documentation/cgroup-v1/cpusets.txt] - -For each node with memory, Linux constructs an independent memory management -subsystem, complete with its own free page lists, in-use page lists, usage -statistics and locks to mediate access. In addition, Linux constructs for -each memory zone [one or more of DMA, DMA32, NORMAL, HIGH_MEMORY, MOVABLE], -an ordered "zonelist". A zonelist specifies the zones/nodes to visit when a -selected zone/node cannot satisfy the allocation request. This situation, -when a zone has no available memory to satisfy a request, is called -"overflow" or "fallback". - -Because some nodes contain multiple zones containing different types of -memory, Linux must decide whether to order the zonelists such that allocations -fall back to the same zone type on a different node, or to a different zone -type on the same node. This is an important consideration because some zones, -such as DMA or DMA32, represent relatively scarce resources. Linux chooses -a default Node ordered zonelist. This means it tries to fallback to other zones -from the same node before using remote nodes which are ordered by NUMA distance. - -By default, Linux will attempt to satisfy memory allocation requests from the -node to which the CPU that executes the request is assigned. Specifically, -Linux will attempt to allocate from the first node in the appropriate zonelist -for the node where the request originates. This is called "local allocation." -If the "local" node cannot satisfy the request, the kernel will examine other -nodes' zones in the selected zonelist looking for the first zone in the list -that can satisfy the request. - -Local allocation will tend to keep subsequent access to the allocated memory -"local" to the underlying physical resources and off the system interconnect-- -as long as the task on whose behalf the kernel allocated some memory does not -later migrate away from that memory. The Linux scheduler is aware of the -NUMA topology of the platform--embodied in the "scheduling domains" data -structures [see Documentation/scheduler/sched-domains.txt]--and the scheduler -attempts to minimize task migration to distant scheduling domains. However, -the scheduler does not take a task's NUMA footprint into account directly. -Thus, under sufficient imbalance, tasks can migrate between nodes, remote -from their initial node and kernel data structures. - -System administrators and application designers can restrict a task's migration -to improve NUMA locality using various CPU affinity command line interfaces, -such as taskset(1) and numactl(1), and program interfaces such as -sched_setaffinity(2). Further, one can modify the kernel's default local -allocation behavior using Linux NUMA memory policy. -[see Documentation/vm/numa_memory_policy.txt.] 
- -System administrators can restrict the CPUs and nodes' memories that a non- -privileged user can specify in the scheduling or NUMA commands and functions -using control groups and CPUsets. [see Documentation/cgroup-v1/cpusets.txt] - -On architectures that do not hide memoryless nodes, Linux will include only -zones [nodes] with memory in the zonelists. This means that for a memoryless -node the "local memory node"--the node of the first zone in CPU's node's -zonelist--will not be the node itself. Rather, it will be the node that the -kernel selected as the nearest node with memory when it built the zonelists. -So, default, local allocations will succeed with the kernel supplying the -closest available memory. This is a consequence of the same mechanism that -allows such allocations to fallback to other nearby nodes when a node that -does contain memory overflows. - -Some kernel allocations do not want or cannot tolerate this allocation fallback -behavior. Rather they want to be sure they get memory from the specified node -or get notified that the node has no free memory. This is usually the case when -a subsystem allocates per CPU memory resources, for example. - -A typical model for making such an allocation is to obtain the node id of the -node to which the "current CPU" is attached using one of the kernel's -numa_node_id() or CPU_to_node() functions and then request memory from only -the node id returned. When such an allocation fails, the requesting subsystem -may revert to its own fallback path. The slab kernel memory allocator is an -example of this. Or, the subsystem may choose to disable or not to enable -itself on allocation failure. The kernel profiling subsystem is an example of -this. - -If the architecture supports--does not hide--memoryless nodes, then CPUs -attached to memoryless nodes would always incur the fallback path overhead -or some subsystems would fail to initialize if they attempted to allocated -memory exclusively from a node without memory. To support such -architectures transparently, kernel subsystems can use the numa_mem_id() -or cpu_to_mem() function to locate the "local memory node" for the calling or -specified CPU. Again, this is the same node from which default, local page -allocations will be attempted. diff --git a/Documentation/vm/numa.rst b/Documentation/vm/numa.rst new file mode 100644 index 000000000000..aada84bc8c46 --- /dev/null +++ b/Documentation/vm/numa.rst @@ -0,0 +1,150 @@ +.. _numa: + +Started Nov 1999 by Kanoj Sarcar + +============= +What is NUMA? +============= + +This question can be answered from a couple of perspectives: the +hardware view and the Linux software view. + +From the hardware perspective, a NUMA system is a computer platform that +comprises multiple components or assemblies each of which may contain 0 +or more CPUs, local memory, and/or IO buses. For brevity and to +disambiguate the hardware view of these physical components/assemblies +from the software abstraction thereof, we'll call the components/assemblies +'cells' in this document. + +Each of the 'cells' may be viewed as an SMP [symmetric multi-processor] subset +of the system--although some components necessary for a stand-alone SMP system +may not be populated on any given cell. The cells of the NUMA system are +connected together with some sort of system interconnect--e.g., a crossbar or +point-to-point link are common types of NUMA system interconnects. 
Both of +these types of interconnects can be aggregated to create NUMA platforms with +cells at multiple distances from other cells. + +For Linux, the NUMA platforms of interest are primarily what is known as Cache +Coherent NUMA or ccNUMA systems. With ccNUMA systems, all memory is visible +to and accessible from any CPU attached to any cell and cache coherency +is handled in hardware by the processor caches and/or the system interconnect. + +Memory access time and effective memory bandwidth vary depending on how far +away the cell containing the CPU or IO bus making the memory access is from the +cell containing the target memory. For example, access to memory by CPUs +attached to the same cell will experience faster access times and higher +bandwidths than accesses to memory on other, remote cells. NUMA platforms +can have cells at multiple remote distances from any given cell. + +Platform vendors don't build NUMA systems just to make software developers' +lives interesting. Rather, this architecture is a means to provide scalable +memory bandwidth. However, to achieve scalable memory bandwidth, system and +application software must arrange for a large majority of the memory references +[cache misses] to be to "local" memory--memory on the same cell, if any--or +to the closest cell with memory. + +This leads to the Linux software view of a NUMA system: + +Linux divides the system's hardware resources into multiple software +abstractions called "nodes". Linux maps the nodes onto the physical cells +of the hardware platform, abstracting away some of the details for some +architectures. As with physical cells, software nodes may contain 0 or more +CPUs, memory and/or IO buses. And, again, memory accesses to memory on +"closer" nodes--nodes that map to closer cells--will generally experience +faster access times and higher effective bandwidth than accesses to more +remote cells. + +For some architectures, such as x86, Linux will "hide" any node representing a +physical cell that has no memory attached, and reassign any CPUs attached to +that cell to a node representing a cell that does have memory. Thus, on +these architectures, one cannot assume that all CPUs that Linux associates with +a given node will see the same local memory access times and bandwidth. + +In addition, for some architectures--again, x86 is an example--Linux supports +the emulation of additional nodes. For NUMA emulation, Linux will carve up +the existing nodes--or the system memory for non-NUMA platforms--into multiple +nodes. Each emulated node will manage a fraction of the underlying cells' +physical memory. NUMA emulation is useful for testing NUMA kernel and +application features on non-NUMA platforms, and as a sort of memory resource +management mechanism when used together with cpusets. +[see Documentation/cgroup-v1/cpusets.txt] + +For each node with memory, Linux constructs an independent memory management +subsystem, complete with its own free page lists, in-use page lists, usage +statistics and locks to mediate access. In addition, Linux constructs for +each memory zone [one or more of DMA, DMA32, NORMAL, HIGH_MEMORY, MOVABLE], +an ordered "zonelist". A zonelist specifies the zones/nodes to visit when a +selected zone/node cannot satisfy the allocation request. This situation, +when a zone has no available memory to satisfy a request, is called +"overflow" or "fallback".
+ +Because some nodes contain multiple zones containing different types of +memory, Linux must decide whether to order the zonelists such that allocations +fall back to the same zone type on a different node, or to a different zone +type on the same node. This is an important consideration because some zones, +such as DMA or DMA32, represent relatively scarce resources. Linux chooses +a default Node ordered zonelist. This means it tries to fall back to other zones +from the same node before using remote nodes, which are ordered by NUMA distance. + +By default, Linux will attempt to satisfy memory allocation requests from the +node to which the CPU that executes the request is assigned. Specifically, +Linux will attempt to allocate from the first node in the appropriate zonelist +for the node where the request originates. This is called "local allocation." +If the "local" node cannot satisfy the request, the kernel will examine other +nodes' zones in the selected zonelist, looking for the first zone in the list +that can satisfy the request. + +Local allocation will tend to keep subsequent access to the allocated memory +"local" to the underlying physical resources and off the system interconnect-- +as long as the task on whose behalf the kernel allocated some memory does not +later migrate away from that memory. The Linux scheduler is aware of the +NUMA topology of the platform--embodied in the "scheduling domains" data +structures [see Documentation/scheduler/sched-domains.txt]--and the scheduler +attempts to minimize task migration to distant scheduling domains. However, +the scheduler does not take a task's NUMA footprint into account directly. +Thus, under sufficient imbalance, tasks can migrate between nodes and end up +remote from their initial node and its kernel data structures. + +System administrators and application designers can restrict a task's migration +to improve NUMA locality using various CPU affinity command line interfaces, +such as taskset(1) and numactl(1), and program interfaces such as +sched_setaffinity(2). Further, one can modify the kernel's default local +allocation behavior using Linux NUMA memory policy. +[see Documentation/vm/numa_memory_policy.rst.] + +System administrators can restrict the CPUs and nodes' memories that a non- +privileged user can specify in the scheduling or NUMA commands and functions +using control groups and CPUsets. [see Documentation/cgroup-v1/cpusets.txt] + +On architectures that do not hide memoryless nodes, Linux will include only +zones [nodes] with memory in the zonelists. This means that for a memoryless +node the "local memory node"--the node of the first zone in the CPU's node's +zonelist--will not be the node itself. Rather, it will be the node that the +kernel selected as the nearest node with memory when it built the zonelists. +So, by default, local allocations will succeed with the kernel supplying the +closest available memory. This is a consequence of the same mechanism that +allows such allocations to fall back to other nearby nodes when a node that +does contain memory overflows. + +Some kernel allocations do not want or cannot tolerate this allocation fallback +behavior. Rather, they want to be sure they get memory from the specified node +or get notified that the node has no free memory. This is usually the case when +a subsystem allocates per CPU memory resources, for example.
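+
+As a hedged illustration of such a node-strict request (the helpers used
+here are introduced in the next paragraph, and BUF_SIZE is a made-up
+constant), a subsystem might ask for memory on the executing CPU's node
+and refuse remote fallback::
+
+	/* __GFP_THISNODE forbids falling back to other nodes' zonelists. */
+	int nid = numa_node_id();	/* node of the executing CPU */
+	void *buf = kmalloc_node(BUF_SIZE, GFP_KERNEL | __GFP_THISNODE, nid);
+
+	if (!buf)
+		pr_warn("node %d has no free local memory\n", nid);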
+ +A typical model for making such an allocation is to obtain the node id of the +node to which the "current CPU" is attached using one of the kernel's +numa_node_id() or cpu_to_node() functions and then request memory from only +the node id returned. When such an allocation fails, the requesting subsystem +may revert to its own fallback path. The slab kernel memory allocator is an +example of this. Or, the subsystem may choose to disable or not to enable +itself on allocation failure. The kernel profiling subsystem is an example of +this. + +If the architecture supports--does not hide--memoryless nodes, then CPUs +attached to memoryless nodes would always incur the fallback path overhead +or some subsystems would fail to initialize if they attempted to allocate +memory exclusively from a node without memory. To support such +architectures transparently, kernel subsystems can use the numa_mem_id() +or cpu_to_mem() function to locate the "local memory node" for the calling or +specified CPU. Again, this is the same node from which default, local page +allocations will be attempted. diff --git a/Documentation/vm/numa_memory_policy.rst b/Documentation/vm/numa_memory_policy.rst new file mode 100644 index 000000000000..8cd942ca114e --- /dev/null +++ b/Documentation/vm/numa_memory_policy.rst @@ -0,0 +1,485 @@ +.. _numa_memory_policy: + +=================== +Linux Memory Policy +=================== + +What is Linux Memory Policy? +============================ + +In the Linux kernel, "memory policy" determines from which node the kernel will +allocate memory in a NUMA system or in an emulated NUMA system. Linux has +supported platforms with Non-Uniform Memory Access architectures since 2.4.?. +The current memory policy support was added to Linux 2.6 around May 2004. This +document attempts to describe the concepts and APIs of the 2.6 memory policy +support. + +Memory policies should not be confused with cpusets +(``Documentation/cgroup-v1/cpusets.txt``), +which is an administrative mechanism for restricting the nodes from which +memory may be allocated by a set of processes. Memory policies are a +programming interface that a NUMA-aware application can take advantage of. When +both cpusets and policies are applied to a task, the restrictions of the cpuset +take priority. See :ref:`Memory Policies and cpusets <mem_pol_and_cpusets>` +below for more details. + +Memory Policy Concepts +====================== + +Scope of Memory Policies +------------------------ + +The Linux kernel supports *scopes* of memory policy, described here from +most general to most specific: + +System Default Policy + this policy is "hard coded" into the kernel. It is the policy + that governs all page allocations that aren't controlled by + one of the more specific policy scopes discussed below. When + the system is "up and running", the system default policy will + use "local allocation" described below. However, during boot + up, the system default policy will be set to interleave + allocations across all nodes with "sufficient" memory, so as + not to overload the initial boot node with boot-time + allocations. + +Task/Process Policy + this is an optional, per-task policy. When defined for a specific task, this policy controls all page allocations made by or on behalf of the task that aren't controlled by a more specific scope. If a task does not define a task policy, then all page allocations that would have been controlled by the task policy "fall back" to the System Default Policy.
+ + The task policy applies to the entire address space of a task. Thus, + it is inheritable, and indeed is inherited, across both fork() + [clone() w/o the CLONE_VM flag] and exec*(). This allows a parent task + to establish the task policy for a child task exec()'d from an + executable image that has no awareness of memory policy. See the + Memory Policy APIs section, below, for an overview of the system call + that a task may use to set/change its task/process policy. + + In a multi-threaded task, task policies apply only to the thread + [Linux kernel task] that installs the policy and any threads + subsequently created by that thread. Any sibling threads existing + at the time a new task policy is installed retain their current + policy. + + A task policy applies only to pages allocated after the policy is + installed. Any pages already faulted in by the task when the task + changes its task policy remain where they were allocated based on + the policy at the time they were allocated. + +.. _vma_policy: + +VMA Policy + A "VMA" or "Virtual Memory Area" refers to a range of a task's + virtual address space. A task may define a specific policy for a range + of its virtual address space. See the Memory Policy APIs section, + below, for an overview of the mbind() system call used to set a VMA + policy. + + A VMA policy will govern the allocation of pages that back + this region of the address space. Any regions of the task's + address space that don't have an explicit VMA policy will fall + back to the task policy, which may itself fall back to the + System Default Policy. + + VMA policies have a few complicating details: + + * VMA policy applies ONLY to anonymous pages. These include + pages allocated for anonymous segments, such as the task + stack and heap, and any regions of the address space + mmap()ed with the MAP_ANONYMOUS flag. If a VMA policy is + applied to a file mapping, it will be ignored if the mapping + used the MAP_SHARED flag. If the file mapping used the + MAP_PRIVATE flag, the VMA policy will only be applied when + an anonymous page is allocated on an attempt to write to the + mapping--i.e., at Copy-On-Write. + + * VMA policies are shared between all tasks that share a + virtual address space--a.k.a. threads--independent of when + the policy is installed; and they are inherited across + fork(). However, because VMA policies refer to a specific + region of a task's address space, and because the address + space is discarded and recreated on exec*(), VMA policies + are NOT inheritable across exec(). Thus, only NUMA-aware + applications may use VMA policies. + + * A task may install a new VMA policy on a sub-range of a + previously mmap()ed region. When this happens, Linux splits + the existing virtual memory area into 2 or 3 VMAs, each with + its own policy. + + * By default, VMA policy applies only to pages allocated after + the policy is installed. Any pages already faulted into the + VMA range remain where they were allocated based on the + policy at the time they were allocated. However, since + 2.6.16, Linux supports page migration via the mbind() system + call, so that page contents can be moved to match a newly + installed policy. + +Shared Policy + Conceptually, shared policies apply to "memory objects" mapped + shared into one or more tasks' distinct address spaces. An + application installs shared policies the same way as VMA + policies--using the mbind() system call specifying a range of + virtual addresses that map the shared object.
However, unlike + VMA policies, which can be considered to be an attribute of a + range of a task's address space, shared policies apply + directly to the shared object. Thus, all tasks that attach to + the object share the policy, and all pages allocated for the + shared object, by any task, will obey the shared policy. + + As of 2.6.22, only shared memory segments, created by shmget() or + mmap(MAP_ANONYMOUS|MAP_SHARED), support shared policy. When shared + policy support was added to Linux, the associated data structures were + added to hugetlbfs shmem segments. At the time, hugetlbfs did not + support allocation at fault time--a.k.a. lazy allocation--so hugetlbfs + shmem segments were never "hooked up" to the shared policy support. + Although hugetlbfs segments now support lazy allocation, their support + for shared policy has not been completed. + + As mentioned above in :ref:`VMA policies <vma_policy>`, + allocations of page cache pages for regular files mmap()ed + with MAP_SHARED ignore any VMA policy installed on the virtual + address range backed by the shared file mapping. Rather, + shared page cache pages, including pages backing private + mappings that have not yet been written by the task, follow + task policy, if any, else System Default Policy. + + The shared policy infrastructure supports different policies on subset + ranges of the shared object. However, Linux still splits the VMA of + the task that installs the policy for each range of distinct policy. + Thus, different tasks that attach to a shared memory segment can have + different VMA configurations mapping that one shared object. This + can be seen by examining the /proc/<pid>/numa_maps of tasks sharing + a shared memory region, when one task has installed shared policy on + one or more ranges of the region. + +Components of Memory Policies +----------------------------- + +A Linux memory policy consists of a "mode", optional mode flags, and +an optional set of nodes. The mode determines the behavior of the +policy, the optional mode flags determine the behavior of the mode, +and the optional set of nodes can be viewed as the arguments to the +policy behavior. + +Internally, memory policies are implemented by a reference counted +structure, struct mempolicy. Details of this structure will be +discussed in context, below, as required to explain the behavior. + +Linux memory policy supports the following 4 behavioral modes: + +Default Mode--MPOL_DEFAULT + This mode is only used in the memory policy APIs. Internally, + MPOL_DEFAULT is converted to the NULL memory policy in all + policy scopes. Any existing non-default policy will simply be + removed when MPOL_DEFAULT is specified. As a result, + MPOL_DEFAULT means "fall back to the next most specific policy + scope." + + For example, a NULL or default task policy will fall back to the + system default policy. A NULL or default vma policy will fall + back to the task policy. + + When specified in one of the memory policy APIs, the Default mode + does not use the optional set of nodes. + + It is an error for the set of nodes specified for this policy to + be non-empty. + +MPOL_BIND + This mode specifies that memory must come from the set of + nodes specified by the policy. Memory will be allocated from + the node in the set with sufficient free memory that is + closest to the node where the allocation takes place. + +MPOL_PREFERRED + This mode specifies that the allocation should be attempted + from the single node specified in the policy.
If that + allocation fails, the kernel will search other nodes, in order + of increasing distance from the preferred node based on + information provided by the platform firmware. + + Internally, the Preferred policy uses a single node--the + preferred_node member of struct mempolicy. When the internal + mode flag MPOL_F_LOCAL is set, the preferred_node is ignored + and the policy is interpreted as local allocation. "Local" + allocation policy can be viewed as a Preferred policy that + starts at the node containing the cpu where the allocation + takes place. + + It is possible for the user to specify that local allocation + is always preferred by passing an empty nodemask with this + mode. If an empty nodemask is passed, the policy cannot use + the MPOL_F_STATIC_NODES or MPOL_F_RELATIVE_NODES flags + described below. + +MPOL_INTERLEAVE + This mode specifies that page allocations be interleaved, on a + page granularity, across the nodes specified in the policy. + This mode also behaves slightly differently, based on the + context where it is used: + + For allocation of anonymous pages and shared memory pages, + Interleave mode indexes the set of nodes specified by the + policy using the page offset of the faulting address into the + segment [VMA] containing the address modulo the number of + nodes specified by the policy. It then attempts to allocate a + page, starting at the selected node, as if the node had been + specified by a Preferred policy or had been selected by a + local allocation. That is, allocation will follow the per + node zonelist. + + For allocation of page cache pages, Interleave mode indexes + the set of nodes specified by the policy using a node counter + maintained per task. This counter wraps around to the lowest + specified node after it reaches the highest specified node. + This will tend to spread the pages out over the nodes + specified by the policy based on the order in which they are + allocated, rather than based on any page offset into an + address range or file. During system boot up, the temporary + interleaved system default policy works in this mode. + +Linux memory policy supports the following optional mode flags: + +MPOL_F_STATIC_NODES + This flag specifies that the nodemask passed by + the user should not be remapped if the task or VMA's set of allowed + nodes changes after the memory policy has been defined. + + Without this flag, anytime a mempolicy is rebound because of a + change in the set of allowed nodes, the node (Preferred) or + nodemask (Bind, Interleave) is remapped to the new set of + allowed nodes. This may result in nodes being used that were + previously undesired. + + With this flag, if the user-specified nodes overlap with the + nodes allowed by the task's cpuset, then the memory policy is + applied to their intersection. If the two sets of nodes do not + overlap, the Default policy is used. + + For example, consider a task that is attached to a cpuset with + mems 1-3 that sets an Interleave policy over the same set. If + the cpuset's mems change to 3-5, the Interleave will now occur + over nodes 3, 4, and 5. With this flag, however, since only node + 3 is allowed from the user's nodemask, the "interleave" only + occurs over that node. If no nodes from the user's nodemask are + now allowed, the Default behavior is used. + + MPOL_F_STATIC_NODES cannot be combined with the + MPOL_F_RELATIVE_NODES flag. It also cannot be used for + MPOL_PREFERRED policies that were created with an empty nodemask + (local allocation).
+ +MPOL_F_RELATIVE_NODES + This flag specifies that the nodemask passed + by the user will be mapped relative to the task's or VMA's + set of allowed nodes. The kernel stores the user-passed nodemask, + and if the set of allowed nodes changes, then that original nodemask will + be remapped relative to the new set of allowed nodes. + + Without this flag (and without MPOL_F_STATIC_NODES), anytime a + mempolicy is rebound because of a change in the set of allowed + nodes, the node (Preferred) or nodemask (Bind, Interleave) is + remapped to the new set of allowed nodes. That remap may not + preserve the relative nature of the user's passed nodemask to its + set of allowed nodes upon successive rebinds: a nodemask of + 1,3,5 may be remapped to 7-9 and then to 1-3 if the set of + allowed nodes is restored to its original state. + + With this flag, the remap is done so that the node numbers from + the user's passed nodemask are relative to the set of allowed + nodes. In other words, if nodes 0, 2, and 4 are set in the user's + nodemask, the policy will be effected over the first (and in the + Bind or Interleave case, the third and fifth) nodes in the set of + allowed nodes. The nodemask passed by the user represents nodes + relative to the task's or VMA's set of allowed nodes. + + If the user's nodemask includes nodes that are outside the range + of the new set of allowed nodes (for example, node 5 is set in + the user's nodemask when the set of allowed nodes is only 0-3), + then the remap wraps around to the beginning of the nodemask and, + if not already set, sets the node in the mempolicy nodemask. + + For example, consider a task that is attached to a cpuset with + mems 2-5 that sets an Interleave policy over the same set with + MPOL_F_RELATIVE_NODES. If the cpuset's mems change to 3-7, the + interleave now occurs over nodes 3,5-7. If the cpuset's mems + then change to 0,2-3,5, then the interleave occurs over nodes + 0,2-3,5. + + Thanks to the consistent remapping, applications preparing + nodemasks to specify memory policies using this flag should + disregard their current, actual cpuset-imposed memory placement + and prepare the nodemask as if they were always located on + memory nodes 0 to N-1, where N is the number of memory nodes the + policy is intended to manage. Let the kernel then remap to the + set of memory nodes allowed by the task's cpuset, as that may + change over time. + + MPOL_F_RELATIVE_NODES cannot be combined with the + MPOL_F_STATIC_NODES flag. It also cannot be used for + MPOL_PREFERRED policies that were created with an empty nodemask + (local allocation). + +Memory Policy Reference Counting +================================ + +To resolve use/free races, struct mempolicy contains an atomic reference +count field. The internal interfaces, mpol_get() and mpol_put(), increment and +decrement this reference count, respectively. mpol_put() will only free +the structure back to the mempolicy kmem cache when the reference count +goes to zero. + +When a new memory policy is allocated, its reference count is initialized +to '1', representing the reference held by the task that is installing the +new policy. When a pointer to a memory policy structure is stored in another +structure, another reference is added, as the task's reference will be dropped +on completion of the policy installation. + +During run-time "usage" of the policy, we attempt to minimize atomic operations +on the reference count, as this can lead to cache lines bouncing between cpus +and NUMA nodes.
"Usage" here means one of the following: + +1) querying of the policy, either by the task itself [using the get_mempolicy() + API discussed below] or by another task using the /proc/<pid>/numa_maps + interface. + +2) examination of the policy to determine the policy mode and associated node + or node lists, if any, for page allocation. This is considered a "hot + path". Note that for MPOL_BIND, the "usage" extends across the entire + allocation process, which may sleep during page reclamation, because the + BIND policy nodemask is used, by reference, to filter ineligible nodes. + +We can avoid taking an extra reference during the usages listed above as +follows: + +1) we never need to get/free the system default policy as this is never + changed nor freed, once the system is up and running. + +2) for querying the policy, we do not need to take an extra reference on the + target task's task policy nor vma policies because we always acquire the + task's mm's mmap_sem for read during the query. The set_mempolicy() and + mbind() APIs [see below] always acquire the mmap_sem for write when + installing or replacing task or vma policies. Thus, there is no possibility + of a task or thread freeing a policy while another task or thread is + querying it. + +3) Page allocation usage of task or vma policy occurs in the fault path where + we hold the mmap_sem for read. Again, because replacing the task or vma + policy requires that the mmap_sem be held for write, the policy can't be + freed out from under us while we're using it for page allocation. + +4) Shared policies require special consideration. One task can replace a + shared memory policy while another task, with a distinct mmap_sem, is + querying or allocating a page based on the policy. To resolve this + potential race, the shared policy infrastructure adds an extra reference + to the shared policy during lookup while holding a spin lock on the shared + policy management structure. This requires that we drop this extra + reference when we're finished "using" the policy. We must drop the + extra reference on shared policies in the same query/allocation paths + used for non-shared policies. For this reason, shared policies are marked + as such, and the extra reference is dropped "conditionally"--i.e., only + for shared policies. + + Because of this extra reference counting, and because we must look up + shared policies in a tree structure under spinlock, shared policies are + more expensive to use in the page allocation path. This is especially + true for shared policies on shared memory regions shared by tasks running + on different NUMA nodes. This extra overhead can be avoided by always + falling back to task or system default policy for shared memory regions, + or by prefaulting the entire shared memory region into memory and locking + it down. However, this might not be appropriate for all applications. + +Memory Policy APIs +================== + +Linux supports 3 system calls for controlling memory policy. These APIs +always affect only the calling task, the calling task's address space, or +some shared object mapped into the calling task's address space. + +.. note:: + the headers that define these APIs and the parameter data types for + user space applications reside in a package that is not part of the + Linux kernel. The kernel system call interfaces, with the 'sys\_' + prefix, are defined in <linux/syscalls.h>; the mode and flag + definitions are defined in <linux/mempolicy.h>.
+ +Set [Task] Memory Policy:: + + long set_mempolicy(int mode, const unsigned long *nmask, + unsigned long maxnode); + +Sets the calling task's "task/process memory policy" to the mode +specified by the 'mode' argument and the set of nodes defined by +'nmask'. 'nmask' points to a bit mask of node ids containing at least +'maxnode' ids. Optional mode flags may be passed by combining the +'mode' argument with the flag (for example: MPOL_INTERLEAVE | +MPOL_F_STATIC_NODES). + +See the set_mempolicy(2) man page for more details. + + +Get [Task] Memory Policy or Related Information:: + + long get_mempolicy(int *mode, + const unsigned long *nmask, unsigned long maxnode, + void *addr, int flags); + +Queries the "task/process memory policy" of the calling task, or the +policy or location of a specified virtual address, depending on the +'flags' argument. + +See the get_mempolicy(2) man page for more details. + + +Install VMA/Shared Policy for a Range of Task's Address Space:: + + long mbind(void *start, unsigned long len, int mode, + const unsigned long *nmask, unsigned long maxnode, + unsigned flags); + +mbind() installs the policy specified by (mode, nmask, maxnode) as a +VMA policy for the range of the calling task's address space specified +by the 'start' and 'len' arguments. Additional actions may be +requested via the 'flags' argument. + +See the mbind(2) man page for more details. + +Memory Policy Command Line Interface +==================================== + +Although not strictly part of the Linux implementation of memory policy, +a command line tool, numactl(8), exists that allows one to: + ++ set the task policy for a specified program via set_mempolicy(2), fork(2) and + exec(2) + ++ set the shared policy for a shared memory segment via mbind(2) + +The numactl(8) tool is packaged with the run-time version of the library +containing the memory policy system call wrappers. Some distributions +package the headers and compile-time libraries in a separate development +package. + +.. _mem_pol_and_cpusets: + +Memory Policies and cpusets +=========================== + +Memory policies work within cpusets as described above. For memory policies +that require a node or set of nodes, the nodes are restricted to the set of +nodes whose memories are allowed by the cpuset constraints. If the nodemask +specified for the policy contains nodes that are not allowed by the cpuset and +MPOL_F_RELATIVE_NODES is not used, the intersection of the set of nodes +specified for the policy and the set of nodes with memory is used. If the +result is the empty set, the policy is considered invalid and cannot be +installed. If MPOL_F_RELATIVE_NODES is used, the policy's nodes are mapped +onto and folded into the task's set of allowed nodes as previously described. + +The interaction of memory policies and cpusets can be problematic when tasks +in two cpusets share access to a memory region, such as shared memory segments +created by shmget() or mmap() with the MAP_ANONYMOUS and MAP_SHARED flags. If +any of the tasks install shared policy on the region, only nodes whose +memories are allowed in both cpusets may be used in the policies. Obtaining +this information requires "stepping outside" the memory policy APIs to use the +cpuset information and requires that one know in what cpusets other tasks might +be attaching to the shared region. Furthermore, if the cpusets' allowed +memory sets are disjoint, "local" allocation is the only valid policy.
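+
+Example
+=======
+
+To make the API usage concrete, here is a minimal user-space sketch that
+combines a task policy with a VMA policy. It is illustrative only: it
+assumes a machine with at least two memory nodes, and it uses the
+<numaif.h> header and wrappers from the numactl development package
+mentioned above (link with -lnuma)::
+
+	#include <numaif.h>
+	#include <sys/mman.h>
+	#include <stdio.h>
+
+	int main(void)
+	{
+		unsigned long nodes = 0x3;	/* nodes 0 and 1, assumed to exist */
+		unsigned long maxnode = 8 * sizeof(nodes);
+		size_t len = 1UL << 20;
+		void *p;
+
+		/* Task policy: interleave this task's allocations over nodes 0-1. */
+		if (set_mempolicy(MPOL_INTERLEAVE, &nodes, maxnode))
+			perror("set_mempolicy");
+
+		p = mmap(NULL, len, PROT_READ | PROT_WRITE,
+			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+		if (p == MAP_FAILED)
+			return 1;
+
+		/* VMA policy: bind this mapping to node 0 only. */
+		nodes = 0x1;
+		if (mbind(p, len, MPOL_BIND, &nodes, maxnode, 0))
+			perror("mbind");
+
+		return 0;
+	}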
diff --git a/Documentation/vm/numa_memory_policy.txt b/Documentation/vm/numa_memory_policy.txt deleted file mode 100644 index 8cd942ca114e..000000000000 --- a/Documentation/vm/numa_memory_policy.txt +++ /dev/null @@ -1,485 +0,0 @@ -.. _numa_memory_policy: - -=================== -Linux Memory Policy -=================== - -What is Linux Memory Policy? -============================ - -In the Linux kernel, "memory policy" determines from which node the kernel will -allocate memory in a NUMA system or in an emulated NUMA system. Linux has -supported platforms with Non-Uniform Memory Access architectures since 2.4.?. -The current memory policy support was added to Linux 2.6 around May 2004. This -document attempts to describe the concepts and APIs of the 2.6 memory policy -support. - -Memory policies should not be confused with cpusets -(``Documentation/cgroup-v1/cpusets.txt``) -which is an administrative mechanism for restricting the nodes from which -memory may be allocated by a set of processes. Memory policies are a -programming interface that a NUMA-aware application can take advantage of. When -both cpusets and policies are applied to a task, the restrictions of the cpuset -takes priority. See :ref:`Memory Policies and cpusets ` -below for more details. - -Memory Policy Concepts -====================== - -Scope of Memory Policies ------------------------- - -The Linux kernel supports _scopes_ of memory policy, described here from -most general to most specific: - -System Default Policy - this policy is "hard coded" into the kernel. It is the policy - that governs all page allocations that aren't controlled by - one of the more specific policy scopes discussed below. When - the system is "up and running", the system default policy will - use "local allocation" described below. However, during boot - up, the system default policy will be set to interleave - allocations across all nodes with "sufficient" memory, so as - not to overload the initial boot node with boot-time - allocations. - -Task/Process Policy - this is an optional, per-task policy. When defined for a specific task, this policy controls all page allocations made by or on behalf of the task that aren't controlled by a more specific scope. If a task does not define a task policy, then all page allocations that would have been controlled by the task policy "fall back" to the System Default Policy. - - The task policy applies to the entire address space of a task. Thus, - it is inheritable, and indeed is inherited, across both fork() - [clone() w/o the CLONE_VM flag] and exec*(). This allows a parent task - to establish the task policy for a child task exec()'d from an - executable image that has no awareness of memory policy. See the - MEMORY POLICY APIS section, below, for an overview of the system call - that a task may use to set/change its task/process policy. - - In a multi-threaded task, task policies apply only to the thread - [Linux kernel task] that installs the policy and any threads - subsequently created by that thread. Any sibling threads existing - at the time a new task policy is installed retain their current - policy. - - A task policy applies only to pages allocated after the policy is - installed. Any pages already faulted in by the task when the task - changes its task policy remain where they were allocated based on - the policy at the time they were allocated. - -.. _vma_policy: - -VMA Policy - A "VMA" or "Virtual Memory Area" refers to a range of a task's - virtual address space. 
A task may define a specific policy for a range - of its virtual address space. See the MEMORY POLICIES APIS section, - below, for an overview of the mbind() system call used to set a VMA - policy. - - A VMA policy will govern the allocation of pages that back - this region ofthe address space. Any regions of the task's - address space that don't have an explicit VMA policy will fall - back to the task policy, which may itself fall back to the - System Default Policy. - - VMA policies have a few complicating details: - - * VMA policy applies ONLY to anonymous pages. These include - pages allocated for anonymous segments, such as the task - stack and heap, and any regions of the address space - mmap()ed with the MAP_ANONYMOUS flag. If a VMA policy is - applied to a file mapping, it will be ignored if the mapping - used the MAP_SHARED flag. If the file mapping used the - MAP_PRIVATE flag, the VMA policy will only be applied when - an anonymous page is allocated on an attempt to write to the - mapping-- i.e., at Copy-On-Write. - - * VMA policies are shared between all tasks that share a - virtual address space--a.k.a. threads--independent of when - the policy is installed; and they are inherited across - fork(). However, because VMA policies refer to a specific - region of a task's address space, and because the address - space is discarded and recreated on exec*(), VMA policies - are NOT inheritable across exec(). Thus, only NUMA-aware - applications may use VMA policies. - - * A task may install a new VMA policy on a sub-range of a - previously mmap()ed region. When this happens, Linux splits - the existing virtual memory area into 2 or 3 VMAs, each with - it's own policy. - - * By default, VMA policy applies only to pages allocated after - the policy is installed. Any pages already faulted into the - VMA range remain where they were allocated based on the - policy at the time they were allocated. However, since - 2.6.16, Linux supports page migration via the mbind() system - call, so that page contents can be moved to match a newly - installed policy. - -Shared Policy - Conceptually, shared policies apply to "memory objects" mapped - shared into one or more tasks' distinct address spaces. An - application installs a shared policies the same way as VMA - policies--using the mbind() system call specifying a range of - virtual addresses that map the shared object. However, unlike - VMA policies, which can be considered to be an attribute of a - range of a task's address space, shared policies apply - directly to the shared object. Thus, all tasks that attach to - the object share the policy, and all pages allocated for the - shared object, by any task, will obey the shared policy. - - As of 2.6.22, only shared memory segments, created by shmget() or - mmap(MAP_ANONYMOUS|MAP_SHARED), support shared policy. When shared - policy support was added to Linux, the associated data structures were - added to hugetlbfs shmem segments. At the time, hugetlbfs did not - support allocation at fault time--a.k.a lazy allocation--so hugetlbfs - shmem segments were never "hooked up" to the shared policy support. - Although hugetlbfs segments now support lazy allocation, their support - for shared policy has not been completed. - - As mentioned above :ref:`VMA policies `, - allocations of page cache pages for regular files mmap()ed - with MAP_SHARED ignore any VMA policy installed on the virtual - address range backed by the shared file mapping. 
Rather, - shared page cache pages, including pages backing private - mappings that have not yet been written by the task, follow - task policy, if any, else System Default Policy. - - The shared policy infrastructure supports different policies on subset - ranges of the shared object. However, Linux still splits the VMA of - the task that installs the policy for each range of distinct policy. - Thus, different tasks that attach to a shared memory segment can have - different VMA configurations mapping that one shared object. This - can be seen by examining the /proc//numa_maps of tasks sharing - a shared memory region, when one task has installed shared policy on - one or more ranges of the region. - -Components of Memory Policies ------------------------------ - -A Linux memory policy consists of a "mode", optional mode flags, and -an optional set of nodes. The mode determines the behavior of the -policy, the optional mode flags determine the behavior of the mode, -and the optional set of nodes can be viewed as the arguments to the -policy behavior. - -Internally, memory policies are implemented by a reference counted -structure, struct mempolicy. Details of this structure will be -discussed in context, below, as required to explain the behavior. - -Linux memory policy supports the following 4 behavioral modes: - -Default Mode--MPOL_DEFAULT - This mode is only used in the memory policy APIs. Internally, - MPOL_DEFAULT is converted to the NULL memory policy in all - policy scopes. Any existing non-default policy will simply be - removed when MPOL_DEFAULT is specified. As a result, - MPOL_DEFAULT means "fall back to the next most specific policy - scope." - - For example, a NULL or default task policy will fall back to the - system default policy. A NULL or default vma policy will fall - back to the task policy. - - When specified in one of the memory policy APIs, the Default mode - does not use the optional set of nodes. - - It is an error for the set of nodes specified for this policy to - be non-empty. - -MPOL_BIND - This mode specifies that memory must come from the set of - nodes specified by the policy. Memory will be allocated from - the node in the set with sufficient free memory that is - closest to the node where the allocation takes place. - -MPOL_PREFERRED - This mode specifies that the allocation should be attempted - from the single node specified in the policy. If that - allocation fails, the kernel will search other nodes, in order - of increasing distance from the preferred node based on - information provided by the platform firmware. - - Internally, the Preferred policy uses a single node--the - preferred_node member of struct mempolicy. When the internal - mode flag MPOL_F_LOCAL is set, the preferred_node is ignored - and the policy is interpreted as local allocation. "Local" - allocation policy can be viewed as a Preferred policy that - starts at the node containing the cpu where the allocation - takes place. - - It is possible for the user to specify that local allocation - is always preferred by passing an empty nodemask with this - mode. If an empty nodemask is passed, the policy cannot use - the MPOL_F_STATIC_NODES or MPOL_F_RELATIVE_NODES flags - described below. - -MPOL_INTERLEAVED - This mode specifies that page allocations be interleaved, on a - page granularity, across the nodes specified in the policy. 
- This mode also behaves slightly differently, based on the - context where it is used: - - For allocation of anonymous pages and shared memory pages, - Interleave mode indexes the set of nodes specified by the - policy using the page offset of the faulting address into the - segment [VMA] containing the address modulo the number of - nodes specified by the policy. It then attempts to allocate a - page, starting at the selected node, as if the node had been - specified by a Preferred policy or had been selected by a - local allocation. That is, allocation will follow the per - node zonelist. - - For allocation of page cache pages, Interleave mode indexes - the set of nodes specified by the policy using a node counter - maintained per task. This counter wraps around to the lowest - specified node after it reaches the highest specified node. - This will tend to spread the pages out over the nodes - specified by the policy based on the order in which they are - allocated, rather than based on any page offset into an - address range or file. During system boot up, the temporary - interleaved system default policy works in this mode. - -Linux memory policy supports the following optional mode flags: - -MPOL_F_STATIC_NODES - This flag specifies that the nodemask passed by - the user should not be remapped if the task or VMA's set of allowed - nodes changes after the memory policy has been defined. - - Without this flag, anytime a mempolicy is rebound because of a - change in the set of allowed nodes, the node (Preferred) or - nodemask (Bind, Interleave) is remapped to the new set of - allowed nodes. This may result in nodes being used that were - previously undesired. - - With this flag, if the user-specified nodes overlap with the - nodes allowed by the task's cpuset, then the memory policy is - applied to their intersection. If the two sets of nodes do not - overlap, the Default policy is used. - - For example, consider a task that is attached to a cpuset with - mems 1-3 that sets an Interleave policy over the same set. If - the cpuset's mems change to 3-5, the Interleave will now occur - over nodes 3, 4, and 5. With this flag, however, since only node - 3 is allowed from the user's nodemask, the "interleave" only - occurs over that node. If no nodes from the user's nodemask are - now allowed, the Default behavior is used. - - MPOL_F_STATIC_NODES cannot be combined with the - MPOL_F_RELATIVE_NODES flag. It also cannot be used for - MPOL_PREFERRED policies that were created with an empty nodemask - (local allocation). - -MPOL_F_RELATIVE_NODES - This flag specifies that the nodemask passed - by the user will be mapped relative to the set of the task or VMA's - set of allowed nodes. The kernel stores the user-passed nodemask, - and if the allowed nodes changes, then that original nodemask will - be remapped relative to the new set of allowed nodes. - - Without this flag (and without MPOL_F_STATIC_NODES), anytime a - mempolicy is rebound because of a change in the set of allowed - nodes, the node (Preferred) or nodemask (Bind, Interleave) is - remapped to the new set of allowed nodes. That remap may not - preserve the relative nature of the user's passed nodemask to its - set of allowed nodes upon successive rebinds: a nodemask of - 1,3,5 may be remapped to 7-9 and then to 1-3 if the set of - allowed nodes is restored to its original state. - - With this flag, the remap is done so that the node numbers from - the user's passed nodemask are relative to the set of allowed - nodes. 
In other words, if nodes 0, 2, and 4 are set in the user's - nodemask, the policy will be effected over the first (and in the - Bind or Interleave case, the third and fifth) nodes in the set of - allowed nodes. The nodemask passed by the user represents nodes - relative to task or VMA's set of allowed nodes. - - If the user's nodemask includes nodes that are outside the range - of the new set of allowed nodes (for example, node 5 is set in - the user's nodemask when the set of allowed nodes is only 0-3), - then the remap wraps around to the beginning of the nodemask and, - if not already set, sets the node in the mempolicy nodemask. - - For example, consider a task that is attached to a cpuset with - mems 2-5 that sets an Interleave policy over the same set with - MPOL_F_RELATIVE_NODES. If the cpuset's mems change to 3-7, the - interleave now occurs over nodes 3,5-7. If the cpuset's mems - then change to 0,2-3,5, then the interleave occurs over nodes - 0,2-3,5. - - Thanks to the consistent remapping, applications preparing - nodemasks to specify memory policies using this flag should - disregard their current, actual cpuset imposed memory placement - and prepare the nodemask as if they were always located on - memory nodes 0 to N-1, where N is the number of memory nodes the - policy is intended to manage. Let the kernel then remap to the - set of memory nodes allowed by the task's cpuset, as that may - change over time. - - MPOL_F_RELATIVE_NODES cannot be combined with the - MPOL_F_STATIC_NODES flag. It also cannot be used for - MPOL_PREFERRED policies that were created with an empty nodemask - (local allocation). - -Memory Policy Reference Counting -================================ - -To resolve use/free races, struct mempolicy contains an atomic reference -count field. Internal interfaces, mpol_get()/mpol_put() increment and -decrement this reference count, respectively. mpol_put() will only free -the structure back to the mempolicy kmem cache when the reference count -goes to zero. - -When a new memory policy is allocated, its reference count is initialized -to '1', representing the reference held by the task that is installing the -new policy. When a pointer to a memory policy structure is stored in another -structure, another reference is added, as the task's reference will be dropped -on completion of the policy installation. - -During run-time "usage" of the policy, we attempt to minimize atomic operations -on the reference count, as this can lead to cache lines bouncing between cpus -and NUMA nodes. "Usage" here means one of the following: - -1) querying of the policy, either by the task itself [using the get_mempolicy() - API discussed below] or by another task using the /proc//numa_maps - interface. - -2) examination of the policy to determine the policy mode and associated node - or node lists, if any, for page allocation. This is considered a "hot - path". Note that for MPOL_BIND, the "usage" extends across the entire - allocation process, which may sleep during page reclaimation, because the - BIND policy nodemask is used, by reference, to filter ineligible nodes. - -We can avoid taking an extra reference during the usages listed above as -follows: - -1) we never need to get/free the system default policy as this is never - changed nor freed, once the system is up and running. 
- -2) for querying the policy, we do not need to take an extra reference on the - target task's task policy nor vma policies because we always acquire the - task's mm's mmap_sem for read during the query. The set_mempolicy() and - mbind() APIs [see below] always acquire the mmap_sem for write when - installing or replacing task or vma policies. Thus, there is no possibility - of a task or thread freeing a policy while another task or thread is - querying it. - -3) Page allocation usage of task or vma policy occurs in the fault path where - we hold them mmap_sem for read. Again, because replacing the task or vma - policy requires that the mmap_sem be held for write, the policy can't be - freed out from under us while we're using it for page allocation. - -4) Shared policies require special consideration. One task can replace a - shared memory policy while another task, with a distinct mmap_sem, is - querying or allocating a page based on the policy. To resolve this - potential race, the shared policy infrastructure adds an extra reference - to the shared policy during lookup while holding a spin lock on the shared - policy management structure. This requires that we drop this extra - reference when we're finished "using" the policy. We must drop the - extra reference on shared policies in the same query/allocation paths - used for non-shared policies. For this reason, shared policies are marked - as such, and the extra reference is dropped "conditionally"--i.e., only - for shared policies. - - Because of this extra reference counting, and because we must lookup - shared policies in a tree structure under spinlock, shared policies are - more expensive to use in the page allocation path. This is especially - true for shared policies on shared memory regions shared by tasks running - on different NUMA nodes. This extra overhead can be avoided by always - falling back to task or system default policy for shared memory regions, - or by prefaulting the entire shared memory region into memory and locking - it down. However, this might not be appropriate for all applications. - -Memory Policy APIs - -Linux supports 3 system calls for controlling memory policy. These APIS -always affect only the calling task, the calling task's address space, or -some shared object mapped into the calling task's address space. - -.. note:: - the headers that define these APIs and the parameter data types for - user space applications reside in a package that is not part of the - Linux kernel. The kernel system call interfaces, with the 'sys\_' - prefix, are defined in ; the mode and flag - definitions are defined in . - -Set [Task] Memory Policy:: - - long set_mempolicy(int mode, const unsigned long *nmask, - unsigned long maxnode); - -Set's the calling task's "task/process memory policy" to mode -specified by the 'mode' argument and the set of nodes defined by -'nmask'. 'nmask' points to a bit mask of node ids containing at least -'maxnode' ids. Optional mode flags may be passed by combining the -'mode' argument with the flag (for example: MPOL_INTERLEAVE | -MPOL_F_STATIC_NODES). - -See the set_mempolicy(2) man page for more details - - -Get [Task] Memory Policy or Related Information:: - - long get_mempolicy(int *mode, - const unsigned long *nmask, unsigned long maxnode, - void *addr, int flags); - -Queries the "task/process memory policy" of the calling task, or the -policy or location of a specified virtual address, depending on the -'flags' argument. 
- -See the get_mempolicy(2) man page for more details - - -Install VMA/Shared Policy for a Range of Task's Address Space:: - - long mbind(void *start, unsigned long len, int mode, - const unsigned long *nmask, unsigned long maxnode, - unsigned flags); - -mbind() installs the policy specified by (mode, nmask, maxnodes) as a -VMA policy for the range of the calling task's address space specified -by the 'start' and 'len' arguments. Additional actions may be -requested via the 'flags' argument. - -See the mbind(2) man page for more details. - -Memory Policy Command Line Interface -==================================== - -Although not strictly part of the Linux implementation of memory policy, -a command line tool, numactl(8), exists that allows one to: - -+ set the task policy for a specified program via set_mempolicy(2), fork(2) and - exec(2) - -+ set the shared policy for a shared memory segment via mbind(2) - -The numactl(8) tool is packaged with the run-time version of the library -containing the memory policy system call wrappers. Some distributions -package the headers and compile-time libraries in a separate development -package. - -.. _mem_pol_and_cpusets: - -Memory Policies and cpusets -=========================== - -Memory policies work within cpusets as described above. For memory policies -that require a node or set of nodes, the nodes are restricted to the set of -nodes whose memories are allowed by the cpuset constraints. If the nodemask -specified for the policy contains nodes that are not allowed by the cpuset and -MPOL_F_RELATIVE_NODES is not used, the intersection of the set of nodes -specified for the policy and the set of nodes with memory is used. If the -result is the empty set, the policy is considered invalid and cannot be -installed. If MPOL_F_RELATIVE_NODES is used, the policy's nodes are mapped -onto and folded into the task's set of allowed nodes as previously described. - -The interaction of memory policies and cpusets can be problematic when tasks -in two cpusets share access to a memory region, such as shared memory segments -created by shmget() of mmap() with the MAP_ANONYMOUS and MAP_SHARED flags, and -any of the tasks install shared policy on the region, only nodes whose -memories are allowed in both cpusets may be used in the policies. Obtaining -this information requires "stepping outside" the memory policy APIs to use the -cpuset information and requires that one know in what cpusets other task might -be attaching to the shared region. Furthermore, if the cpusets' allowed -memory sets are disjoint, "local" allocation is the only valid policy. diff --git a/Documentation/vm/overcommit-accounting b/Documentation/vm/overcommit-accounting deleted file mode 100644 index 0dd54bbe4afa..000000000000 --- a/Documentation/vm/overcommit-accounting +++ /dev/null @@ -1,87 +0,0 @@ -.. _overcommit_accounting: - -===================== -Overcommit Accounting -===================== - -The Linux kernel supports the following overcommit handling modes - -0 - Heuristic overcommit handling. Obvious overcommits of address - space are refused. Used for a typical system. It ensures a - seriously wild allocation fails while allowing overcommit to - reduce swap usage. root is allowed to allocate slightly more - memory in this mode. This is the default. - -1 - Always overcommit. Appropriate for some scientific - applications. Classic example is code using sparse arrays and - just relying on the virtual memory consisting almost entirely - of zero pages. - -2 - Don't overcommit. 
The total address space commit for the - system is not permitted to exceed swap + a configurable amount - (default is 50%) of physical RAM. Depending on the amount you - use, in most situations this means a process will not be - killed while accessing pages but will receive errors on memory - allocation as appropriate. - - Useful for applications that want to guarantee their memory - allocations will be available in the future without having to - initialize every page. - -The overcommit policy is set via the sysctl ``vm.overcommit_memory``. - -The overcommit amount can be set via ``vm.overcommit_ratio`` (percentage) -or ``vm.overcommit_kbytes`` (absolute value). - -The current overcommit limit and amount committed are viewable in -``/proc/meminfo`` as CommitLimit and Committed_AS respectively. - -Gotchas -======= - -The C language stack growth does an implicit mremap. If you want absolute -guarantees and run close to the edge you MUST mmap your stack for the -largest size you think you will need. For typical stack usage this does -not matter much but it's a corner case if you really really care - -In mode 2 the MAP_NORESERVE flag is ignored. - - -How It Works -============ - -The overcommit is based on the following rules - -For a file backed map - | SHARED or READ-only - 0 cost (the file is the map not swap) - | PRIVATE WRITABLE - size of mapping per instance - -For an anonymous or ``/dev/zero`` map - | SHARED - size of mapping - | PRIVATE READ-only - 0 cost (but of little use) - | PRIVATE WRITABLE - size of mapping per instance - -Additional accounting - | Pages made writable copies by mmap - | shmfs memory drawn from the same pool - -Status -====== - -* We account mmap memory mappings -* We account mprotect changes in commit -* We account mremap changes in size -* We account brk -* We account munmap -* We report the commit status in /proc -* Account and check on fork -* Review stack handling/building on exec -* SHMfs accounting -* Implement actual limit enforcement - -To Do -===== -* Account ptrace pages (this is hard) diff --git a/Documentation/vm/overcommit-accounting.rst b/Documentation/vm/overcommit-accounting.rst new file mode 100644 index 000000000000..0dd54bbe4afa --- /dev/null +++ b/Documentation/vm/overcommit-accounting.rst @@ -0,0 +1,87 @@ +.. _overcommit_accounting: + +===================== +Overcommit Accounting +===================== + +The Linux kernel supports the following overcommit handling modes + +0 + Heuristic overcommit handling. Obvious overcommits of address + space are refused. Used for a typical system. It ensures a + seriously wild allocation fails while allowing overcommit to + reduce swap usage. root is allowed to allocate slightly more + memory in this mode. This is the default. + +1 + Always overcommit. Appropriate for some scientific + applications. Classic example is code using sparse arrays and + just relying on the virtual memory consisting almost entirely + of zero pages. + +2 + Don't overcommit. The total address space commit for the + system is not permitted to exceed swap + a configurable amount + (default is 50%) of physical RAM. Depending on the amount you + use, in most situations this means a process will not be + killed while accessing pages but will receive errors on memory + allocation as appropriate. + + Useful for applications that want to guarantee their memory + allocations will be available in the future without having to + initialize every page. + +The overcommit policy is set via the sysctl ``vm.overcommit_memory``. 
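+
+For example, a test harness that wants deterministic commit-limit behavior
+might switch to mode 2 before running. A minimal sketch (writing these
+files requires root; the ratio knob it touches is described just below)::
+
+	#include <stdio.h>
+
+	/* Write a value to a sysctl's /proc/sys file; returns 0 on success. */
+	static int write_sysctl(const char *path, const char *val)
+	{
+		FILE *f = fopen(path, "w");
+
+		if (!f)
+			return -1;
+		fputs(val, f);
+		return fclose(f);
+	}
+
+	int main(void)
+	{
+		/* Mode 2: don't overcommit; limit = swap + ratio% of RAM. */
+		write_sysctl("/proc/sys/vm/overcommit_memory", "2");
+		write_sysctl("/proc/sys/vm/overcommit_ratio", "75");
+		return 0;
+	}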
+ +The overcommit amount can be set via ``vm.overcommit_ratio`` (percentage) +or ``vm.overcommit_kbytes`` (absolute value). + +The current overcommit limit and amount committed are viewable in +``/proc/meminfo`` as CommitLimit and Committed_AS respectively. + +Gotchas +======= + +The C language stack growth does an implicit mremap. If you want absolute +guarantees and run close to the edge you MUST mmap your stack for the +largest size you think you will need. For typical stack usage this does +not matter much but it's a corner case if you really really care + +In mode 2 the MAP_NORESERVE flag is ignored. + + +How It Works +============ + +The overcommit is based on the following rules + +For a file backed map + | SHARED or READ-only - 0 cost (the file is the map not swap) + | PRIVATE WRITABLE - size of mapping per instance + +For an anonymous or ``/dev/zero`` map + | SHARED - size of mapping + | PRIVATE READ-only - 0 cost (but of little use) + | PRIVATE WRITABLE - size of mapping per instance + +Additional accounting + | Pages made writable copies by mmap + | shmfs memory drawn from the same pool + +Status +====== + +* We account mmap memory mappings +* We account mprotect changes in commit +* We account mremap changes in size +* We account brk +* We account munmap +* We report the commit status in /proc +* Account and check on fork +* Review stack handling/building on exec +* SHMfs accounting +* Implement actual limit enforcement + +To Do +===== +* Account ptrace pages (this is hard) diff --git a/Documentation/vm/page_frags b/Documentation/vm/page_frags deleted file mode 100644 index 637cc49d1b2f..000000000000 --- a/Documentation/vm/page_frags +++ /dev/null @@ -1,45 +0,0 @@ -.. _page_frags: - -============== -Page fragments -============== - -A page fragment is an arbitrary-length arbitrary-offset area of memory -which resides within a 0 or higher order compound page. Multiple -fragments within that page are individually refcounted, in the page's -reference counter. - -The page_frag functions, page_frag_alloc and page_frag_free, provide a -simple allocation framework for page fragments. This is used by the -network stack and network device drivers to provide a backing region of -memory for use as either an sk_buff->head, or to be used in the "frags" -portion of skb_shared_info. - -In order to make use of the page fragment APIs a backing page fragment -cache is needed. This provides a central point for the fragment allocation -and tracks allows multiple calls to make use of a cached page. The -advantage to doing this is that multiple calls to get_page can be avoided -which can be expensive at allocation time. However due to the nature of -this caching it is required that any calls to the cache be protected by -either a per-cpu limitation, or a per-cpu limitation and forcing interrupts -to be disabled when executing the fragment allocation. - -The network stack uses two separate caches per CPU to handle fragment -allocation. The netdev_alloc_cache is used by callers making use of the -__netdev_alloc_frag and __netdev_alloc_skb calls. The napi_alloc_cache is -used by callers of the __napi_alloc_frag and __napi_alloc_skb calls. The -main difference between these two calls is the context in which they may be -called. The "netdev" prefixed functions are usable in any context as these -functions will disable interrupts, while the "napi" prefixed functions are -only usable within the softirq context. 
- -Many network device drivers use a similar methodology for allocating page -fragments, but the page fragments are cached at the ring or descriptor -level. In order to enable these cases it is necessary to provide a generic -way of tearing down a page cache. For this reason __page_frag_cache_drain -was implemented. It allows for freeing multiple references from a single -page via a single call. The advantage to doing this is that it allows for -cleaning up the multiple references that were added to a page in order to -avoid calling get_page per allocation. - -Alexander Duyck, Nov 29, 2016. diff --git a/Documentation/vm/page_frags.rst b/Documentation/vm/page_frags.rst new file mode 100644 index 000000000000..637cc49d1b2f --- /dev/null +++ b/Documentation/vm/page_frags.rst @@ -0,0 +1,45 @@ +.. _page_frags: + +============== +Page fragments +============== + +A page fragment is an arbitrary-length arbitrary-offset area of memory +which resides within a 0 or higher order compound page. Multiple +fragments within that page are individually refcounted, in the page's +reference counter. + +The page_frag functions, page_frag_alloc and page_frag_free, provide a +simple allocation framework for page fragments. This is used by the +network stack and network device drivers to provide a backing region of +memory for use as either an sk_buff->head, or to be used in the "frags" +portion of skb_shared_info. + +In order to make use of the page fragment APIs a backing page fragment +cache is needed. This provides a central point for the fragment allocation +and tracks allows multiple calls to make use of a cached page. The +advantage to doing this is that multiple calls to get_page can be avoided +which can be expensive at allocation time. However due to the nature of +this caching it is required that any calls to the cache be protected by +either a per-cpu limitation, or a per-cpu limitation and forcing interrupts +to be disabled when executing the fragment allocation. + +The network stack uses two separate caches per CPU to handle fragment +allocation. The netdev_alloc_cache is used by callers making use of the +__netdev_alloc_frag and __netdev_alloc_skb calls. The napi_alloc_cache is +used by callers of the __napi_alloc_frag and __napi_alloc_skb calls. The +main difference between these two calls is the context in which they may be +called. The "netdev" prefixed functions are usable in any context as these +functions will disable interrupts, while the "napi" prefixed functions are +only usable within the softirq context. + +Many network device drivers use a similar methodology for allocating page +fragments, but the page fragments are cached at the ring or descriptor +level. In order to enable these cases it is necessary to provide a generic +way of tearing down a page cache. For this reason __page_frag_cache_drain +was implemented. It allows for freeing multiple references from a single +page via a single call. The advantage to doing this is that it allows for +cleaning up the multiple references that were added to a page in order to +avoid calling get_page per allocation. + +Alexander Duyck, Nov 29, 2016. diff --git a/Documentation/vm/page_migration b/Documentation/vm/page_migration deleted file mode 100644 index 07b67a821a12..000000000000 --- a/Documentation/vm/page_migration +++ /dev/null @@ -1,257 +0,0 @@ -.. 
_page_migration: - -============== -Page migration -============== - -Page migration allows the moving of the physical location of pages between -nodes in a numa system while the process is running. This means that the -virtual addresses that the process sees do not change. However, the -system rearranges the physical location of those pages. - -The main intend of page migration is to reduce the latency of memory access -by moving pages near to the processor where the process accessing that memory -is running. - -Page migration allows a process to manually relocate the node on which its -pages are located through the MF_MOVE and MF_MOVE_ALL options while setting -a new memory policy via mbind(). The pages of process can also be relocated -from another process using the sys_migrate_pages() function call. The -migrate_pages function call takes two sets of nodes and moves pages of a -process that are located on the from nodes to the destination nodes. -Page migration functions are provided by the numactl package by Andi Kleen -(a version later than 0.9.3 is required. Get it from -ftp://oss.sgi.com/www/projects/libnuma/download/). numactl provides libnuma -which provides an interface similar to other numa functionality for page -migration. cat ``/proc//numa_maps`` allows an easy review of where the -pages of a process are located. See also the numa_maps documentation in the -proc(5) man page. - -Manual migration is useful if for example the scheduler has relocated -a process to a processor on a distant node. A batch scheduler or an -administrator may detect the situation and move the pages of the process -nearer to the new processor. The kernel itself does only provide -manual page migration support. Automatic page migration may be implemented -through user space processes that move pages. A special function call -"move_pages" allows the moving of individual pages within a process. -A NUMA profiler may f.e. obtain a log showing frequent off node -accesses and may use the result to move pages to more advantageous -locations. - -Larger installations usually partition the system using cpusets into -sections of nodes. Paul Jackson has equipped cpusets with the ability to -move pages when a task is moved to another cpuset (See -Documentation/cgroup-v1/cpusets.txt). -Cpusets allows the automation of process locality. If a task is moved to -a new cpuset then also all its pages are moved with it so that the -performance of the process does not sink dramatically. Also the pages -of processes in a cpuset are moved if the allowed memory nodes of a -cpuset are changed. - -Page migration allows the preservation of the relative location of pages -within a group of nodes for all migration techniques which will preserve a -particular memory allocation pattern generated even after migrating a -process. This is necessary in order to preserve the memory latencies. -Processes will run with similar performance after migration. - -Page migration occurs in several steps. First a high level -description for those trying to use migrate_pages() from the kernel -(for userspace usage see the Andi Kleen's numactl package mentioned above) -and then a low level description of how the low level details work. - -In kernel use of migrate_pages() -================================ - -1. Remove pages from the LRU. - - Lists of pages to be migrated are generated by scanning over - pages and moving them into lists. This is done by - calling isolate_lru_page(). 
- Calling isolate_lru_page increases the references to the page - so that it cannot vanish while the page migration occurs. - It also prevents the swapper or other scans to encounter - the page. - -2. We need to have a function of type new_page_t that can be - passed to migrate_pages(). This function should figure out - how to allocate the correct new page given the old page. - -3. The migrate_pages() function is called which attempts - to do the migration. It will call the function to allocate - the new page for each page that is considered for - moving. - -How migrate_pages() works -========================= - -migrate_pages() does several passes over its list of pages. A page is moved -if all references to a page are removable at the time. The page has -already been removed from the LRU via isolate_lru_page() and the refcount -is increased so that the page cannot be freed while page migration occurs. - -Steps: - -1. Lock the page to be migrated - -2. Insure that writeback is complete. - -3. Lock the new page that we want to move to. It is locked so that accesses to - this (not yet uptodate) page immediately lock while the move is in progress. - -4. All the page table references to the page are converted to migration - entries. This decreases the mapcount of a page. If the resulting - mapcount is not zero then we do not migrate the page. All user space - processes that attempt to access the page will now wait on the page lock. - -5. The radix tree lock is taken. This will cause all processes trying - to access the page via the mapping to block on the radix tree spinlock. - -6. The refcount of the page is examined and we back out if references remain - otherwise we know that we are the only one referencing this page. - -7. The radix tree is checked and if it does not contain the pointer to this - page then we back out because someone else modified the radix tree. - -8. The new page is prepped with some settings from the old page so that - accesses to the new page will discover a page with the correct settings. - -9. The radix tree is changed to point to the new page. - -10. The reference count of the old page is dropped because the radix tree - reference is gone. A reference to the new page is established because - the new page is referenced to by the radix tree. - -11. The radix tree lock is dropped. With that lookups in the mapping - become possible again. Processes will move from spinning on the tree_lock - to sleeping on the locked new page. - -12. The page contents are copied to the new page. - -13. The remaining page flags are copied to the new page. - -14. The old page flags are cleared to indicate that the page does - not provide any information anymore. - -15. Queued up writeback on the new page is triggered. - -16. If migration entries were page then replace them with real ptes. Doing - so will enable access for user space processes not already waiting for - the page lock. - -19. The page locks are dropped from the old and new page. - Processes waiting on the page lock will redo their page faults - and will reach the new page. - -20. The new page is moved to the LRU and can be scanned by the swapper - etc again. - -Non-LRU page migration -====================== - -Although original migration aimed for reducing the latency of memory access -for NUMA, compaction who want to create high-order page is also main customer. - -Current problem of the implementation is that it is designed to migrate only -*LRU* pages. 
However, there are potential non-lru pages which can be migrated -in drivers, for example, zsmalloc, virtio-balloon pages. - -For virtio-balloon pages, some parts of migration code path have been hooked -up and added virtio-balloon specific functions to intercept migration logics. -It's too specific to a driver so other drivers who want to make their pages -movable would have to add own specific hooks in migration path. - -To overclome the problem, VM supports non-LRU page migration which provides -generic functions for non-LRU movable pages without driver specific hooks -migration path. - -If a driver want to make own pages movable, it should define three functions -which are function pointers of struct address_space_operations. - -1. ``bool (*isolate_page) (struct page *page, isolate_mode_t mode);`` - - What VM expects on isolate_page function of driver is to return *true* - if driver isolates page successfully. On returing true, VM marks the page - as PG_isolated so concurrent isolation in several CPUs skip the page - for isolation. If a driver cannot isolate the page, it should return *false*. - - Once page is successfully isolated, VM uses page.lru fields so driver - shouldn't expect to preserve values in that fields. - -2. ``int (*migratepage) (struct address_space *mapping,`` -| ``struct page *newpage, struct page *oldpage, enum migrate_mode);`` - - After isolation, VM calls migratepage of driver with isolated page. - The function of migratepage is to move content of the old page to new page - and set up fields of struct page newpage. Keep in mind that you should - indicate to the VM the oldpage is no longer movable via __ClearPageMovable() - under page_lock if you migrated the oldpage successfully and returns - MIGRATEPAGE_SUCCESS. If driver cannot migrate the page at the moment, driver - can return -EAGAIN. On -EAGAIN, VM will retry page migration in a short time - because VM interprets -EAGAIN as "temporal migration failure". On returning - any error except -EAGAIN, VM will give up the page migration without retrying - in this time. - - Driver shouldn't touch page.lru field VM using in the functions. - -3. ``void (*putback_page)(struct page *);`` - - If migration fails on isolated page, VM should return the isolated page - to the driver so VM calls driver's putback_page with migration failed page. - In this function, driver should put the isolated page back to the own data - structure. - -4. non-lru movable page flags - - There are two page flags for supporting non-lru movable page. - - * PG_movable - - Driver should use the below function to make page movable under page_lock:: - - void __SetPageMovable(struct page *page, struct address_space *mapping) - - It needs argument of address_space for registering migration - family functions which will be called by VM. Exactly speaking, - PG_movable is not a real flag of struct page. Rather than, VM - reuses page->mapping's lower bits to represent it. - -:: - #define PAGE_MAPPING_MOVABLE 0x2 - page->mapping = page->mapping | PAGE_MAPPING_MOVABLE; - - so driver shouldn't access page->mapping directly. Instead, driver should - use page_mapping which mask off the low two bits of page->mapping under - page lock so it can get right struct address_space. - - For testing of non-lru movable page, VM supports __PageMovable function. - However, it doesn't guarantee to identify non-lru movable page because - page->mapping field is unified with other variables in struct page. 
-   As well, if driver releases the page after isolation by VM, page->mapping
-   doesn't have stable value although it has PAGE_MAPPING_MOVABLE
-   (Look at __ClearPageMovable). But __PageMovable is cheap to catch whether
-   page is LRU or non-lru movable once the page has been isolated. Because
-   LRU pages never can have PAGE_MAPPING_MOVABLE in page->mapping. It is also
-   good for just peeking to test non-lru movable pages before more expensive
-   checking with lock_page in pfn scanning to select victim.
-
-   For guaranteeing non-lru movable page, VM provides PageMovable function.
-   Unlike __PageMovable, PageMovable functions validates page->mapping and
-   mapping->a_ops->isolate_page under lock_page. The lock_page prevents sudden
-   destroying of page->mapping.
-
-   Driver using __SetPageMovable should clear the flag via __ClearMovablePage
-   under page_lock before the releasing the page.
-
-   * PG_isolated
-
-   To prevent concurrent isolation among several CPUs, VM marks isolated page
-   as PG_isolated under lock_page. So if a CPU encounters PG_isolated non-lru
-   movable page, it can skip it. Driver doesn't need to manipulate the flag
-   because VM will set/clear it automatically. Keep in mind that if driver
-   sees PG_isolated page, it means the page have been isolated by VM so it
-   shouldn't touch page.lru field.
-   PG_isolated is alias with PG_reclaim flag so driver shouldn't use the flag
-   for own purpose.
-
-Christoph Lameter, May 8, 2006.
-Minchan Kim, Mar 28, 2016.
diff --git a/Documentation/vm/page_migration.rst b/Documentation/vm/page_migration.rst
new file mode 100644
index 000000000000..07b67a821a12
--- /dev/null
+++ b/Documentation/vm/page_migration.rst
@@ -0,0 +1,257 @@
+.. _page_migration:
+
+==============
+Page migration
+==============
+
+Page migration allows the moving of the physical location of pages between
+nodes in a NUMA system while the process is running. This means that the
+virtual addresses that the process sees do not change. However, the
+system rearranges the physical location of those pages.
+
+The main intent of page migration is to reduce the latency of memory access
+by moving pages near to the processor where the process accessing that memory
+is running.
+
+Page migration allows a process to manually relocate the node on which its
+pages are located through the MF_MOVE and MF_MOVE_ALL options while setting
+a new memory policy via mbind(). The pages of a process can also be relocated
+from another process using the sys_migrate_pages() function call. The
+migrate_pages function call takes two sets of nodes and moves pages of a
+process that are located on the source nodes to the destination nodes.
+Page migration functions are provided by the numactl package by Andi Kleen
+(a version later than 0.9.3 is required. Get it from
+ftp://oss.sgi.com/www/projects/libnuma/download/). numactl provides libnuma,
+which provides an interface similar to other NUMA functionality for page
+migration. cat ``/proc/<pid>/numa_maps`` allows an easy review of where the
+pages of a process are located. See also the numa_maps documentation in the
+proc(5) man page.
+
+Manual migration is useful if, for example, the scheduler has relocated
+a process to a processor on a distant node. A batch scheduler or an
+administrator may detect the situation and move the pages of the process
+nearer to the new processor. The kernel itself only provides
+manual page migration support. Automatic page migration may be implemented
+through user space processes that move pages. A special function call
+"move_pages" allows the moving of individual pages within a process.
+A NUMA profiler may, for example, obtain a log showing frequent off-node
+accesses and may use the result to move pages to more advantageous
+locations.
+
+Larger installations usually partition the system using cpusets into
+sections of nodes. Paul Jackson has equipped cpusets with the ability to
+move pages when a task is moved to another cpuset (see
+Documentation/cgroup-v1/cpusets.txt).
+Cpusets allow the automation of process locality. If a task is moved to
+a new cpuset then all its pages are moved with it so that the
+performance of the process does not drop dramatically. The pages
+of processes in a cpuset are also moved if the allowed memory nodes of a
+cpuset are changed.
+
+Page migration allows the preservation of the relative location of pages
+within a group of nodes for all migration techniques, which preserves a
+particular memory allocation pattern even after migrating a
+process. This is necessary in order to preserve the memory latencies.
+Processes will run with similar performance after migration.
+
+Page migration occurs in several steps. First comes a high level
+description for those trying to use migrate_pages() from the kernel
+(for userspace usage see Andi Kleen's numactl package mentioned above),
+followed by a low level description of how the details work.
+
+In kernel use of migrate_pages()
+================================
+
+1. Remove pages from the LRU.
+
+   Lists of pages to be migrated are generated by scanning over
+   pages and moving them into lists. This is done by
+   calling isolate_lru_page().
+   Calling isolate_lru_page increases the references to the page
+   so that it cannot vanish while the page migration occurs.
+   It also prevents the swapper or other scans from encountering
+   the page.
+
+2. We need to have a function of type new_page_t that can be
+   passed to migrate_pages(). This function should figure out
+   how to allocate the correct new page given the old page.
+
+3. The migrate_pages() function is called which attempts
+   to do the migration. It will call the function to allocate
+   the new page for each page that is considered for
+   moving.
+
+How migrate_pages() works
+=========================
+
+migrate_pages() does several passes over its list of pages. A page is moved
+if all references to a page are removable at the time. The page has
+already been removed from the LRU via isolate_lru_page() and the refcount
+is increased so that the page cannot be freed while page migration occurs.
+
+Steps:
+
+1. Lock the page to be migrated.
+
+2. Ensure that writeback is complete.
+
+3. Lock the new page that we want to move to. It is locked so that accesses to
+   this (not yet uptodate) page immediately block while the move is in progress.
+
+4. All the page table references to the page are converted to migration
+   entries. This decreases the mapcount of a page. If the resulting
+   mapcount is not zero then we do not migrate the page. All user space
+   processes that attempt to access the page will now wait on the page lock.
+
+5. The radix tree lock is taken. This will cause all processes trying
+   to access the page via the mapping to block on the radix tree spinlock.
+
+6. The refcount of the page is examined and we back out if references remain;
+   otherwise, we know that we are the only one referencing this page.
+
+7. The radix tree is checked and if it does not contain the pointer to this
+   page then we back out because someone else modified the radix tree.
+
+8. The new page is prepped with some settings from the old page so that
+   accesses to the new page will discover a page with the correct settings.
+
+9. The radix tree is changed to point to the new page.
+
+10. The reference count of the old page is dropped because the radix tree
+    reference is gone. A reference to the new page is established because
+    the new page is referenced by the radix tree.
+
+11. The radix tree lock is dropped. With that, lookups in the mapping
+    become possible again. Processes will move from spinning on the tree_lock
+    to sleeping on the locked new page.
+
+12. The page contents are copied to the new page.
+
+13. The remaining page flags are copied to the new page.
+
+14. The old page flags are cleared to indicate that the page does
+    not provide any information anymore.
+
+15. Queued up writeback on the new page is triggered.
+
+16. If migration entries were inserted then replace them with real ptes. Doing
+    so will enable access for user space processes not already waiting for
+    the page lock.
+
+17. The page locks are dropped from the old and new page.
+    Processes waiting on the page lock will redo their page faults
+    and will reach the new page.
+
+18. The new page is moved to the LRU and can be scanned by the swapper
+    etc again.
+
+Non-LRU page migration
+======================
+
+Although migration originally aimed at reducing the latency of memory access
+for NUMA, compaction, which wants to create high-order pages, is also a main
+customer.
+
+The current problem with the implementation is that it is designed to migrate
+only *LRU* pages. However, there are potential non-lru pages which can be
+migrated in drivers, for example, zsmalloc and virtio-balloon pages.
+
+For virtio-balloon pages, some parts of the migration code path have been
+hooked up and virtio-balloon specific functions added to intercept the
+migration logic. This is too specific to one driver, so other drivers that
+want to make their pages movable would have to add their own specific hooks
+in the migration path.
+
+To overcome the problem, VM supports non-LRU page migration, which provides
+generic functions for non-LRU movable pages without driver specific hooks
+in the migration path.
+
+If a driver wants to make its own pages movable, it should define three
+functions, which are function pointers of struct address_space_operations.
+
+1. ``bool (*isolate_page) (struct page *page, isolate_mode_t mode);``
+
+   What VM expects of the driver's isolate_page function is to return *true*
+   if the driver isolates the page successfully. On returning true, VM marks
+   the page as PG_isolated so concurrent isolation on several CPUs skips the
+   page. If a driver cannot isolate the page, it should return *false*.
+
+   Once the page is successfully isolated, VM uses the page.lru fields, so the
+   driver shouldn't expect to preserve values in those fields.
+
+2. ``int (*migratepage) (struct address_space *mapping,``
+|	``struct page *newpage, struct page *oldpage, enum migrate_mode);``
+
+   After isolation, VM calls the driver's migratepage with the isolated page.
+   The function of migratepage is to move the content of the old page to the
+   new page and set up the fields of struct page newpage. Keep in mind that
+   you should indicate to the VM that the oldpage is no longer movable via
+   __ClearPageMovable() under page_lock if you migrated the oldpage
+   successfully and return MIGRATEPAGE_SUCCESS. If the driver cannot migrate
+   the page at the moment, it can return -EAGAIN. On -EAGAIN, VM will retry
+   page migration after a short time because VM interprets -EAGAIN as
+   "temporary migration failure". On returning any error except -EAGAIN, VM
+   will give up on migrating the page without retrying.
+
+   The driver shouldn't touch the page.lru field that VM is using in these
+   functions.
+
+3. ``void (*putback_page)(struct page *);``
+
+   If migration fails on the isolated page, VM should return the isolated
+   page to the driver, so VM calls the driver's putback_page with the page
+   whose migration failed. In this function, the driver should put the
+   isolated page back into its own data structure.
+
+4. non-lru movable page flags
+
+   There are two page flags for supporting non-lru movable pages.
+
+   * PG_movable
+
+   The driver should use the function below to make a page movable under
+   page_lock::
+
+	void __SetPageMovable(struct page *page, struct address_space *mapping)
+
+   It needs the address_space argument for registering the migration
+   family of functions, which will be called by VM. Strictly speaking,
+   PG_movable is not a real flag of struct page. Rather, VM
+   reuses the lower bits of page->mapping to represent it::
+
+	#define PAGE_MAPPING_MOVABLE 0x2
+	page->mapping = page->mapping | PAGE_MAPPING_MOVABLE;
+
+   so the driver shouldn't access page->mapping directly. Instead, the driver
+   should use page_mapping, which masks off the low two bits of page->mapping
+   under the page lock to get the right struct address_space.
+
+   For testing a non-lru movable page, VM supports the __PageMovable function.
+   However, it doesn't guarantee identifying a non-lru movable page because
+   the page->mapping field is unified with other variables in struct page.
+   As well, if the driver releases the page after isolation by VM,
+   page->mapping doesn't have a stable value although it has
+   PAGE_MAPPING_MOVABLE (look at __ClearPageMovable). But __PageMovable is a
+   cheap way to tell whether a page is LRU or non-lru movable once the page
+   has been isolated, because LRU pages can never have PAGE_MAPPING_MOVABLE
+   in page->mapping. It is also good for just peeking to test non-lru movable
+   pages before the more expensive checking with lock_page in pfn scanning to
+   select a victim.
+
+   For a guaranteed test of a non-lru movable page, VM provides the
+   PageMovable function. Unlike __PageMovable, PageMovable validates
+   page->mapping and mapping->a_ops->isolate_page under lock_page. The
+   lock_page prevents sudden destruction of page->mapping.
+
+   A driver using __SetPageMovable should clear the flag via
+   __ClearPageMovable under page_lock before releasing the page.
+
+   * PG_isolated
+
+   To prevent concurrent isolation among several CPUs, VM marks an isolated
+   page as PG_isolated under lock_page. So if a CPU encounters a PG_isolated
+   non-lru movable page, it can skip it. The driver doesn't need to manipulate
+   the flag because VM will set/clear it automatically. Keep in mind that if
+   the driver sees a PG_isolated page, it means the page has been isolated by
+   VM, so it shouldn't touch the page.lru field.
+   PG_isolated is an alias of the PG_reclaim flag, so the driver shouldn't use
+   the flag for its own purposes.
+
+Christoph Lameter, May 8, 2006.
+Minchan Kim, Mar 28, 2016.
diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst
new file mode 100644
index 000000000000..0ed5ab8c7ab4
--- /dev/null
+++ b/Documentation/vm/page_owner.rst
@@ -0,0 +1,90 @@
+.. _page_owner:
+
+==================================================
+page owner: Tracking about who allocated each page
+==================================================
+
+Introduction
+============
+
+page owner is for tracking who allocated each page.
+It can be used to debug memory leaks or to find a memory hog.
+When an allocation happens, information about the allocation, such as the
+call stack and the order of the pages, is stored in dedicated storage for
+each page.
+When we need to know the status of all pages, we can get and analyze
+this information.
+
+Although we already have tracepoints for tracing page allocation/free,
+using them to analyze who allocated each page is rather complex. We need
+to enlarge the trace buffer to prevent it from overflowing before a
+userspace program is launched, and the launched program would continually
+dump out the trace buffer for later analysis, which is more likely to
+change system behaviour than just keeping the information in memory, so it
+is bad for debugging.
+
+page owner can also be used for various other purposes. For example,
+accurate fragmentation statistics can be obtained through the gfp flag
+information of each page. This is already implemented and activated if
+page owner is enabled. Other usages are more than welcome.
+
+page owner is disabled by default. So, if you'd like to use it, you need
+to add "page_owner=on" to your boot cmdline. If the kernel is built
+with page owner and page owner is disabled at runtime because the boot
+option was not given, the runtime overhead is marginal. If disabled at
+runtime, it doesn't require memory to store owner information, so there is
+no runtime memory overhead. And, page owner inserts just two unlikely
+branches into the page allocator hotpath; if not enabled, allocation is
+done just as in a kernel without page owner. These two unlikely branches
+should not affect allocation performance, especially if the static keys
+jump label patching functionality is available. Following is the kernel's
+code size change due to this facility.
+
+- Without page owner::
+
+   text    data     bss     dec     hex filename
+  40662    1493     644   42799    a72f mm/page_alloc.o
+
+- With page owner::
+
+   text    data     bss     dec     hex filename
+  40892    1493     644   43029    a815 mm/page_alloc.o
+   1427      24       8    1459     5b3 mm/page_ext.o
+   2722      50       0    2772     ad4 mm/page_owner.o
+
+Although roughly 4 KB of code is added in total, page_alloc.o increases by
+230 bytes and only half of it is in the hotpath. Building the kernel with
+page owner and turning it on if needed would be a great option for
+debugging kernel memory problems.
+
+There is one caveat caused by an implementation detail. page owner
+stores information into the memory from struct page extension. This memory
+is initialized some time later than the page allocator starts on sparse
+memory systems, so, until initialization, many pages can be allocated and
+they would have no owner information. To fix this up, these early allocated
+pages are investigated and marked as allocated in the initialization phase.
+Although it doesn't mean that they have the right owner information,
+at least, we can tell whether the page is allocated or not,
+more accurately. On a 2GB memory x86-64 VM box, 13343 early allocated pages
+are caught and marked, although they are mostly allocated from the struct
+page extension feature. Anyway, after that, no page is left in an
+untracked state.
+
+Usage
+=====
+
+1) Build user-space helper::
+
+	cd tools/vm
+	make page_owner_sort
+
+2) Enable page owner: add "page_owner=on" to boot cmdline.
+ +3) Do the job what you want to debug + +4) Analyze information from page owner:: + + cat /sys/kernel/debug/page_owner > page_owner_full.txt + grep -v ^PFN page_owner_full.txt > page_owner.txt + ./page_owner_sort page_owner.txt sorted_page_owner.txt + + See the result about who allocated each page + in the ``sorted_page_owner.txt``. diff --git a/Documentation/vm/page_owner.txt b/Documentation/vm/page_owner.txt deleted file mode 100644 index 0ed5ab8c7ab4..000000000000 --- a/Documentation/vm/page_owner.txt +++ /dev/null @@ -1,90 +0,0 @@ -.. _page_owner: - -================================================== -page owner: Tracking about who allocated each page -================================================== - -Introduction -============ - -page owner is for the tracking about who allocated each page. -It can be used to debug memory leak or to find a memory hogger. -When allocation happens, information about allocation such as call stack -and order of pages is stored into certain storage for each page. -When we need to know about status of all pages, we can get and analyze -this information. - -Although we already have tracepoint for tracing page allocation/free, -using it for analyzing who allocate each page is rather complex. We need -to enlarge the trace buffer for preventing overlapping until userspace -program launched. And, launched program continually dump out the trace -buffer for later analysis and it would change system behviour with more -possibility rather than just keeping it in memory, so bad for debugging. - -page owner can also be used for various purposes. For example, accurate -fragmentation statistics can be obtained through gfp flag information of -each page. It is already implemented and activated if page owner is -enabled. Other usages are more than welcome. - -page owner is disabled in default. So, if you'd like to use it, you need -to add "page_owner=on" into your boot cmdline. If the kernel is built -with page owner and page owner is disabled in runtime due to no enabling -boot option, runtime overhead is marginal. If disabled in runtime, it -doesn't require memory to store owner information, so there is no runtime -memory overhead. And, page owner inserts just two unlikely branches into -the page allocator hotpath and if not enabled, then allocation is done -like as the kernel without page owner. These two unlikely branches should -not affect to allocation performance, especially if the static keys jump -label patching functionality is available. Following is the kernel's code -size change due to this facility. - -- Without page owner:: - - text data bss dec hex filename - 40662 1493 644 42799 a72f mm/page_alloc.o - -- With page owner:: - - text data bss dec hex filename - 40892 1493 644 43029 a815 mm/page_alloc.o - 1427 24 8 1459 5b3 mm/page_ext.o - 2722 50 0 2772 ad4 mm/page_owner.o - -Although, roughly, 4 KB code is added in total, page_alloc.o increase by -230 bytes and only half of it is in hotpath. Building the kernel with -page owner and turning it on if needed would be great option to debug -kernel memory problem. - -There is one notice that is caused by implementation detail. page owner -stores information into the memory from struct page extension. This memory -is initialized some time later than that page allocator starts in sparse -memory system, so, until initialization, many pages can be allocated and -they would have no owner information. To fix it up, these early allocated -pages are investigated and marked as allocated in initialization phase. 
-Although it doesn't mean that they have the right owner information,
-at least, we can tell whether the page is allocated or not,
-more accurately. On 2GB memory x86-64 VM box, 13343 early allocated pages
-are catched and marked, although they are mostly allocated from struct
-page extension feature. Anyway, after that, no page is left in
-un-tracking state.
-
-Usage
-=====
-
-1) Build user-space helper::
-
-	cd tools/vm
-	make page_owner_sort
-
-2) Enable page owner: add "page_owner=on" to boot cmdline.
-
-3) Do the job what you want to debug
-
-4) Analyze information from page owner::
-
-	cat /sys/kernel/debug/page_owner > page_owner_full.txt
-	grep -v ^PFN page_owner_full.txt > page_owner.txt
-	./page_owner_sort page_owner.txt sorted_page_owner.txt
-
-   See the result about who allocated each page
-   in the ``sorted_page_owner.txt``.
diff --git a/Documentation/vm/pagemap.rst b/Documentation/vm/pagemap.rst
new file mode 100644
index 000000000000..d54b4bfd3043
--- /dev/null
+++ b/Documentation/vm/pagemap.rst
@@ -0,0 +1,197 @@
+.. _pagemap:
+
+======================================
+pagemap from the Userspace Perspective
+======================================
+
+pagemap is a new (as of 2.6.25) set of interfaces in the kernel that allow
+userspace programs to examine the page tables and related information by
+reading files in ``/proc``.
+
+There are four components to pagemap:
+
+ * ``/proc/pid/pagemap``. This file lets a userspace process find out which
+   physical frame each virtual page is mapped to. It contains one 64-bit
+   value for each virtual page, containing the following data (from
+   fs/proc/task_mmu.c, above pagemap_read):
+
+    * Bits 0-54  page frame number (PFN) if present
+    * Bits 0-4   swap type if swapped
+    * Bits 5-54  swap offset if swapped
+    * Bit  55    pte is soft-dirty (see Documentation/vm/soft-dirty.rst)
+    * Bit  56    page exclusively mapped (since 4.2)
+    * Bits 57-60 zero
+    * Bit  61    page is file-page or shared-anon (since 3.5)
+    * Bit  62    page swapped
+    * Bit  63    page present
+
+   Since Linux 4.0 only users with the CAP_SYS_ADMIN capability can get PFNs.
+   In 4.0 and 4.1 opens by unprivileged users fail with -EPERM. Starting from
+   4.2 the PFN field is zeroed if the user does not have CAP_SYS_ADMIN.
+   Reason: information about PFNs helps in exploiting the Rowhammer
+   vulnerability.
+
+   If the page is not present but in swap, then the PFN contains an
+   encoding of the swap file number and the page's offset into the
+   swap. Unmapped pages return a null PFN. This allows determining
+   precisely which pages are mapped (or in swap) and comparing mapped
+   pages between processes.
+
+   Efficient users of this interface will use /proc/pid/maps to
+   determine which areas of memory are actually mapped and llseek to
+   skip over unmapped regions.
+
+ * ``/proc/kpagecount``. This file contains a 64-bit count of the number of
+   times each page is mapped, indexed by PFN.
+
+ * ``/proc/kpageflags``. This file contains a 64-bit set of flags for each
+   page, indexed by PFN.
+
+   The flags are (from ``fs/proc/page.c``, above kpageflags_read):
+
+    0. LOCKED
+    1. ERROR
+    2. REFERENCED
+    3. UPTODATE
+    4. DIRTY
+    5. LRU
+    6. ACTIVE
+    7. SLAB
+    8. WRITEBACK
+    9. RECLAIM
+    10. BUDDY
+    11. MMAP
+    12. ANON
+    13. SWAPCACHE
+    14. SWAPBACKED
+    15. COMPOUND_HEAD
+    16. COMPOUND_TAIL
+    17. HUGE
+    18. UNEVICTABLE
+    19. HWPOISON
+    20. NOPAGE
+    21. KSM
+    22. THP
+    23. BALLOON
+    24. ZERO_PAGE
+    25. IDLE
+
+ * ``/proc/kpagecgroup``. This file contains a 64-bit inode number of the
+   memory cgroup each page is charged to, indexed by PFN. Only available when
+   CONFIG_MEMCG is set.
+
+Short descriptions of the page flags
+====================================
+
+0 - LOCKED
+   page is being locked for exclusive access, eg. by undergoing read/write IO
+7 - SLAB
+   page is managed by the SLAB/SLOB/SLUB/SLQB kernel memory allocator
+   When compound page is used, SLUB/SLQB will only set this flag on the head
+   page; SLOB will not flag it at all.
+10 - BUDDY
+   a free memory block managed by the buddy system allocator
+   The buddy system organizes free memory in blocks of various orders.
+   An order N block has 2^N physically contiguous pages, with the BUDDY flag
+   set for and _only_ for the first page.
+15 - COMPOUND_HEAD
+   A compound page with order N consists of 2^N physically contiguous pages.
+   A compound page with order 2 takes the form of "HTTT", where H denotes its
+   head page and T denotes its tail page(s). The major consumers of compound
+   pages are hugeTLB pages (Documentation/vm/hugetlbpage.rst), the SLUB etc.
+   memory allocators and various device drivers. However in this interface,
+   only huge/giga pages are made visible to end users.
+16 - COMPOUND_TAIL
+   A compound page tail (see description above).
+17 - HUGE
+   this is an integral part of a HugeTLB page
+19 - HWPOISON
+   hardware detected memory corruption on this page: don't touch the data!
+20 - NOPAGE
+   no page frame exists at the requested address
+21 - KSM
+   identical memory pages dynamically shared between one or more processes
+22 - THP
+   contiguous pages which construct transparent hugepages
+23 - BALLOON
+   balloon compaction page
+24 - ZERO_PAGE
+   zero page for pfn_zero or huge_zero page
+25 - IDLE
+   page has not been accessed since it was marked idle (see
+   Documentation/vm/idle_page_tracking.rst). Note that this flag may be
+   stale in case the page was accessed via a PTE. To make sure the flag
+   is up-to-date one has to read ``/sys/kernel/mm/page_idle/bitmap`` first.
+
+IO related page flags
+---------------------
+
+1 - ERROR
+   IO error occurred
+3 - UPTODATE
+   page has up-to-date data
+   ie. for file backed page: (in-memory data revision >= on-disk one)
+4 - DIRTY
+   page has been written to, hence contains new data
+   ie. for file backed page: (in-memory data revision > on-disk one)
+8 - WRITEBACK
+   page is being synced to disk
+
+LRU related page flags
+----------------------
+
+5 - LRU
+   page is in one of the LRU lists
+6 - ACTIVE
+   page is in the active LRU list
+18 - UNEVICTABLE
+   page is in the unevictable (non-)LRU list. It is somehow pinned and
+   not a candidate for LRU page reclaims, eg. ramfs pages,
+   shmctl(SHM_LOCK) and mlock() memory segments
+2 - REFERENCED
+   page has been referenced since last LRU list enqueue/requeue
+9 - RECLAIM
+   page will be reclaimed soon after its pageout IO completed
+11 - MMAP
+   a memory mapped page
+12 - ANON
+   a memory mapped page that is not part of a file
+13 - SWAPCACHE
+   page is mapped to swap space, ie. has an associated swap entry
+14 - SWAPBACKED
+   page is backed by swap/RAM
+
+The page-types tool in the tools/vm directory can be used to query the
+above flags.
+
+Using pagemap to do something useful
+====================================
+
+The general procedure for using pagemap to find out about a process' memory
+usage goes like this:
+
+ 1. Read ``/proc/pid/maps`` to determine which parts of the memory space are
+    mapped to what.
+ 2. Select the maps you are interested in -- all of them, or a particular
+    library, or the stack or the heap, etc.
+ 3. Open ``/proc/pid/pagemap`` and seek to the pages you would like to examine.
+ 4. Read a u64 for each page from pagemap.
+ 5. Open ``/proc/kpagecount`` and/or ``/proc/kpageflags``. For each PFN you
+    just read, seek to that entry in the file, and read the data you want.
+
+For example, to find the "unique set size" (USS), which is the amount of
+memory that a process is using that is not shared with any other process,
+you can go through every map in the process, find the PFNs, look those up
+in kpagecount, and tally up the number of pages that are only referenced
+once.
+
+Other notes
+===========
+
+Reading from any of the files will return -EINVAL if you are not starting
+the read on an 8-byte boundary (e.g., if you sought an odd number of bytes
+into the file), or if the size of the read is not a multiple of 8 bytes.
+
+Before Linux 3.11 pagemap bits 55-60 were used for "page-shift" (which is
+always 12 on most architectures). Since Linux 3.11 their meaning changes
+after first clear of soft-dirty bits. Since Linux 4.2 they are used for
+flags unconditionally.
diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt
deleted file mode 100644
index bd6d71740c88..000000000000
--- a/Documentation/vm/pagemap.txt
+++ /dev/null
@@ -1,197 +0,0 @@
-.. _pagemap:
-
-======================================
-pagemap from the Userspace Perspective
-======================================
-
-pagemap is a new (as of 2.6.25) set of interfaces in the kernel that allow
-userspace programs to examine the page tables and related information by
-reading files in ``/proc``.
-
-There are four components to pagemap:
-
- * ``/proc/pid/pagemap``. This file lets a userspace process find out which
-   physical frame each virtual page is mapped to. It contains one 64-bit
-   value for each virtual page, containing the following data (from
-   fs/proc/task_mmu.c, above pagemap_read):
-
-    * Bits 0-54  page frame number (PFN) if present
-    * Bits 0-4   swap type if swapped
-    * Bits 5-54  swap offset if swapped
-    * Bit  55    pte is soft-dirty (see Documentation/vm/soft-dirty.txt)
-    * Bit  56    page exclusively mapped (since 4.2)
-    * Bits 57-60 zero
-    * Bit  61    page is file-page or shared-anon (since 3.5)
-    * Bit  62    page swapped
-    * Bit  63    page present
-
-   Since Linux 4.0 only users with the CAP_SYS_ADMIN capability can get PFNs.
-   In 4.0 and 4.1 opens by unprivileged fail with -EPERM. Starting from
-   4.2 the PFN field is zeroed if the user does not have CAP_SYS_ADMIN.
-   Reason: information about PFNs helps in exploiting Rowhammer vulnerability.
-
-   If the page is not present but in swap, then the PFN contains an
-   encoding of the swap file number and the page's offset into the
-   swap. Unmapped pages return a null PFN. This allows determining
-   precisely which pages are mapped (or in swap) and comparing mapped
-   pages between processes.
-
-   Efficient users of this interface will use /proc/pid/maps to
-   determine which areas of memory are actually mapped and llseek to
-   skip over unmapped regions.
-
- * ``/proc/kpagecount``. This file contains a 64-bit count of the number of
-   times each page is mapped, indexed by PFN.
-
- * ``/proc/kpageflags``. This file contains a 64-bit set of flags for each
-   page, indexed by PFN.
-
-   The flags are (from ``fs/proc/page.c``, above kpageflags_read):
-
-    0. LOCKED
-    1. ERROR
-    2. REFERENCED
-    3. UPTODATE
-    4. DIRTY
-    5. LRU
-    6. ACTIVE
-    7. SLAB
-    8. WRITEBACK
-    9. RECLAIM
-    10. BUDDY
-    11. MMAP
-    12. ANON
-    13. SWAPCACHE
-    14. SWAPBACKED
-    15. COMPOUND_HEAD
-    16. COMPOUND_TAIL
-    17. HUGE
-    18. UNEVICTABLE
-    19. HWPOISON
-    20. NOPAGE
-    21. KSM
-    22. THP
-    23. BALLOON
-    24. ZERO_PAGE
-    25. IDLE
-
- * ``/proc/kpagecgroup``. This file contains a 64-bit inode number of the
-   memory cgroup each page is charged to, indexed by PFN. Only available when
-   CONFIG_MEMCG is set.
-
-Short descriptions to the page flags:
-=====================================
-
-0 - LOCKED
-   page is being locked for exclusive access, eg. by undergoing read/write IO
-7 - SLAB
-   page is managed by the SLAB/SLOB/SLUB/SLQB kernel memory allocator
-   When compound page is used, SLUB/SLQB will only set this flag on the head
-   page; SLOB will not flag it at all.
-10 - BUDDY
-   a free memory block managed by the buddy system allocator
-   The buddy system organizes free memory in blocks of various orders.
-   An order N block has 2^N physically contiguous pages, with the BUDDY flag
-   set for and _only_ for the first page.
-15 - COMPOUND_HEAD
-   A compound page with order N consists of 2^N physically contiguous pages.
-   A compound page with order 2 takes the form of "HTTT", where H donates its
-   head page and T donates its tail page(s). The major consumers of compound
-   pages are hugeTLB pages (Documentation/vm/hugetlbpage.txt), the SLUB etc.
-   memory allocators and various device drivers. However in this interface,
-   only huge/giga pages are made visible to end users.
-16 - COMPOUND_TAIL
-   A compound page tail (see description above).
-17 - HUGE
-   this is an integral part of a HugeTLB page
-19 - HWPOISON
-   hardware detected memory corruption on this page: don't touch the data!
-20 - NOPAGE
-   no page frame exists at the requested address
-21 - KSM
-   identical memory pages dynamically shared between one or more processes
-22 - THP
-   contiguous pages which construct transparent hugepages
-23 - BALLOON
-   balloon compaction page
-24 - ZERO_PAGE
-   zero page for pfn_zero or huge_zero page
-25 - IDLE
-   page has not been accessed since it was marked idle (see
-   Documentation/vm/idle_page_tracking.txt). Note that this flag may be
-   stale in case the page was accessed via a PTE. To make sure the flag
-   is up-to-date one has to read ``/sys/kernel/mm/page_idle/bitmap`` first.
-
-IO related page flags
----------------------
-
-1 - ERROR
-   IO error occurred
-3 - UPTODATE
-   page has up-to-date data
-   ie. for file backed page: (in-memory data revision >= on-disk one)
-4 - DIRTY
-   page has been written to, hence contains new data
-   ie. for file backed page: (in-memory data revision > on-disk one)
-8 - WRITEBACK
-   page is being synced to disk
-
-LRU related page flags
-----------------------
-
-5 - LRU
-   page is in one of the LRU lists
-6 - ACTIVE
-   page is in the active LRU list
-18 - UNEVICTABLE
-   page is in the unevictable (non-)LRU list It is somehow pinned and
-   not a candidate for LRU page reclaims, eg. ramfs pages,
-   shmctl(SHM_LOCK) and mlock() memory segments
-2 - REFERENCED
-   page has been referenced since last LRU list enqueue/requeue
-9 - RECLAIM
-   page will be reclaimed soon after its pageout IO completed
-11 - MMAP
-   a memory mapped page
-12 - ANON
-   a memory mapped page that is not part of a file
-13 - SWAPCACHE
-   page is mapped to swap space, ie. has an associated swap entry
-14 - SWAPBACKED
-   page is backed by swap/RAM
-
-The page-types tool in the tools/vm directory can be used to query the
-above flags.
-
-Using pagemap to do something useful
-====================================
-
-The general procedure for using pagemap to find out about a process' memory
-usage goes like this:
-
- 1. Read ``/proc/pid/maps`` to determine which parts of the memory space are
-    mapped to what.
- 2. Select the maps you are interested in -- all of them, or a particular
-    library, or the stack or the heap, etc.
- 3. Open ``/proc/pid/pagemap`` and seek to the pages you would like to examine.
- 4. Read a u64 for each page from pagemap.
- 5. Open ``/proc/kpagecount`` and/or ``/proc/kpageflags``. For each PFN you
-    just read, seek to that entry in the file, and read the data you want.
-
-For example, to find the "unique set size" (USS), which is the amount of
-memory that a process is using that is not shared with any other process,
-you can go through every map in the process, find the PFNs, look those up
-in kpagecount, and tally up the number of pages that are only referenced
-once.
-
-Other notes
-===========
-
-Reading from any of the files will return -EINVAL if you are not starting
-the read on an 8-byte boundary (e.g., if you sought an odd number of bytes
-into the file), or if the size of the read is not a multiple of 8 bytes.
-
-Before Linux 3.11 pagemap bits 55-60 were used for "page-shift" (which is
-always 12 at most architectures). Since Linux 3.11 their meaning changes
-after first clear of soft-dirty bits. Since Linux 4.2 they are used for
-flags unconditionally.
diff --git a/Documentation/vm/remap_file_pages.rst b/Documentation/vm/remap_file_pages.rst
new file mode 100644
index 000000000000..7bef6718e3a9
--- /dev/null
+++ b/Documentation/vm/remap_file_pages.rst
@@ -0,0 +1,33 @@
+.. _remap_file_pages:
+
+==============================
+remap_file_pages() system call
+==============================
+
+The remap_file_pages() system call is used to create a nonlinear mapping,
+that is, a mapping in which the pages of the file are mapped into a
+nonsequential order in memory. The advantage of using remap_file_pages()
+over using repeated calls to mmap(2) is that the former approach does not
+require the kernel to create additional VMA (Virtual Memory Area) data
+structures.
+
+Supporting nonlinear mappings requires a significant amount of non-trivial
+code in the kernel virtual memory subsystem, including hot paths. Also, to
+make nonlinear mappings work, the kernel needs a way to distinguish normal
+page table entries from entries with a file offset (pte_file). The kernel
+reserves a flag in the PTE for this purpose. PTE flags are a scarce
+resource, especially on some CPU architectures. It would be nice to free up
+the flag for other usage.
+
+Fortunately, there are not many users of remap_file_pages() in the wild.
+It's only known that one enterprise RDBMS implementation uses the syscall
+on 32-bit systems to map files bigger than can linearly fit into 32-bit
+virtual address space. This use-case is not critical anymore since 64-bit
+systems are widely available.
+
+The syscall is deprecated and has been replaced with an emulation now. The
+emulation creates new VMAs instead of nonlinear mappings. It's going to
+work slower for rare users of remap_file_pages() but the ABI is preserved.
+
+One side effect of the emulation (apart from performance) is that the user
+can hit the vm.max_map_count limit more easily due to the additional VMAs.
+See the comment for DEFAULT_MAX_MAP_COUNT for more details on the limit.
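+
+As an illustration of what the emulation does, the sketch below builds a
+small "nonlinear" view of a file with plain mmap(2) calls. It is only a
+hedged example (the file name is hypothetical); each MAP_FIXED mapping
+creates its own VMA, which is exactly the per-mapping cost discussed
+above::
+
+	/* Map the first two pages of a file in reversed order. */
+	#include <fcntl.h>
+	#include <stdio.h>
+	#include <sys/mman.h>
+	#include <unistd.h>
+
+	int main(void)
+	{
+		long psz = sysconf(_SC_PAGESIZE);
+		int fd = open("datafile", O_RDONLY);
+		char *base;
+
+		if (fd < 0)
+			return 1;
+		/* Reserve a two-page window backed by the file. */
+		base = mmap(NULL, 2 * psz, PROT_READ, MAP_PRIVATE, fd, 0);
+		if (base == MAP_FAILED)
+			return 1;
+		/* File page 1 first, then file page 0: two extra VMAs. */
+		mmap(base, psz, PROT_READ, MAP_PRIVATE | MAP_FIXED, fd, psz);
+		mmap(base + psz, psz, PROT_READ, MAP_PRIVATE | MAP_FIXED,
+		     fd, 0);
+		printf("reversed file view at %p\n", base);
+		return 0;
+	}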
diff --git a/Documentation/vm/remap_file_pages.txt b/Documentation/vm/remap_file_pages.txt deleted file mode 100644 index 7bef6718e3a9..000000000000 --- a/Documentation/vm/remap_file_pages.txt +++ /dev/null @@ -1,33 +0,0 @@ -.. _remap_file_pages: - -============================== -remap_file_pages() system call -============================== - -The remap_file_pages() system call is used to create a nonlinear mapping, -that is, a mapping in which the pages of the file are mapped into a -nonsequential order in memory. The advantage of using remap_file_pages() -over using repeated calls to mmap(2) is that the former approach does not -require the kernel to create additional VMA (Virtual Memory Area) data -structures. - -Supporting of nonlinear mapping requires significant amount of non-trivial -code in kernel virtual memory subsystem including hot paths. Also to get -nonlinear mapping work kernel need a way to distinguish normal page table -entries from entries with file offset (pte_file). Kernel reserves flag in -PTE for this purpose. PTE flags are scarce resource especially on some CPU -architectures. It would be nice to free up the flag for other usage. - -Fortunately, there are not many users of remap_file_pages() in the wild. -It's only known that one enterprise RDBMS implementation uses the syscall -on 32-bit systems to map files bigger than can linearly fit into 32-bit -virtual address space. This use-case is not critical anymore since 64-bit -systems are widely available. - -The syscall is deprecated and replaced it with an emulation now. The -emulation creates new VMAs instead of nonlinear mappings. It's going to -work slower for rare users of remap_file_pages() but ABI is preserved. - -One side effect of emulation (apart from performance) is that user can hit -vm.max_map_count limit more easily due to additional VMAs. See comment for -DEFAULT_MAX_MAP_COUNT for more details on the limit. diff --git a/Documentation/vm/slub.rst b/Documentation/vm/slub.rst new file mode 100644 index 000000000000..3a775fd64e2d --- /dev/null +++ b/Documentation/vm/slub.rst @@ -0,0 +1,361 @@ +.. _slub: + +========================== +Short users guide for SLUB +========================== + +The basic philosophy of SLUB is very different from SLAB. SLAB +requires rebuilding the kernel to activate debug options for all +slab caches. SLUB always includes full debugging but it is off by default. +SLUB can enable debugging only for selected slabs in order to avoid +an impact on overall system performance which may make a bug more +difficult to find. + +In order to switch debugging on one can add an option ``slub_debug`` +to the kernel command line. That will enable full debugging for +all slabs. + +Typically one would then use the ``slabinfo`` command to get statistical +data and perform operation on the slabs. By default ``slabinfo`` only lists +slabs that have data in them. See "slabinfo -h" for more options when +running the command. ``slabinfo`` can be compiled with +:: + + gcc -o slabinfo tools/vm/slabinfo.c + +Some of the modes of operation of ``slabinfo`` require that slub debugging +be enabled on the command line. F.e. no tracking information will be +available without debugging on and validation can only partially +be performed if debugging was not switched on. + +Some more sophisticated uses of slub_debug: +------------------------------------------- + +Parameters may be given to ``slub_debug``. If none is specified then full +debugging is enabled. 
+Format:
+
+slub_debug=<Debug-Options>
+	Enable options for all slabs
+slub_debug=<Debug-Options>,<slab name>
+	Enable options only for select slabs
+
+
+Possible debug options are::
+
+	F		Sanity checks on (enables SLAB_DEBUG_CONSISTENCY_CHECKS
+			Sorry SLAB legacy issues)
+	Z		Red zoning
+	P		Poisoning (object and padding)
+	U		User tracking (free and alloc)
+	T		Trace (please only use on single slabs)
+	A		Toggle failslab filter mark for the cache
+	O		Switch debugging off for caches that would have
+			caused higher minimum slab orders
+	-		Switch all debugging off (useful if the kernel is
+			configured with CONFIG_SLUB_DEBUG_ON)
+
+F.e. in order to boot just with sanity checks and red zoning one would specify::
+
+	slub_debug=FZ
+
+Trying to find an issue in the dentry cache? Try::
+
+	slub_debug=,dentry
+
+to only enable debugging on the dentry cache.
+
+Red zoning and tracking may realign the slab. We can just apply sanity checks
+to the dentry cache with::
+
+	slub_debug=F,dentry
+
+Debugging options may require the minimum possible slab order to increase as
+a result of storing the metadata (for example, caches with PAGE_SIZE object
+sizes). This has a higher likelihood of resulting in slab allocation errors
+in low memory situations or if there's high fragmentation of memory. To
+switch off debugging for such caches by default, use::
+
+	slub_debug=O
+
+In case you forgot to enable debugging on the kernel command line: It is
+possible to enable debugging manually when the kernel is up. Look at the
+contents of::
+
+	/sys/kernel/slab/<slab name>/
+
+Look at the writable files. Writing 1 to them will enable the
+corresponding debug option (see the sketch below). All options can be set
+on a slab that does not contain objects. If the slab already contains
+objects then sanity checks and tracing may only be enabled. The other
+options may cause the realignment of objects.
+
+Careful with tracing: It may spew out lots of information and never stop if
+used on the wrong slab.
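+A hedged C sketch of flipping such an option at runtime; the cache and
+attribute names used here (``dentry``, ``sanity_checks``) are illustrative,
+pick real ones from the listing of /sys/kernel/slab/::
+
+	#include <stdio.h>
+
+	/* Equivalent of: echo 1 > /sys/kernel/slab/<slab name>/<option> */
+	static int slub_enable_option(const char *slab, const char *opt)
+	{
+		char path[256];
+		FILE *f;
+
+		snprintf(path, sizeof(path), "/sys/kernel/slab/%s/%s", slab, opt);
+		f = fopen(path, "w");
+		if (!f)
+			return -1;	/* unknown cache/option or no permission */
+		fputs("1\n", f);
+		return fclose(f);
+	}
+
+	int main(void)
+	{
+		/* Sanity checks can be enabled even on a populated cache. */
+		return slub_enable_option("dentry", "sanity_checks") ? 1 : 0;
+	}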
+Slab merging
+============
+
+If no debug options are specified then SLUB may merge similar slabs together
+in order to reduce overhead and increase cache hotness of objects.
+``slabinfo -a`` displays which slabs were merged together.
+
+Slab validation
+===============
+
+SLUB can validate all objects if the kernel was booted with slub_debug. In
+order to do so you must have the ``slabinfo`` tool. Then you can do
+::
+
+	slabinfo -v
+
+which will test all objects. Output will be generated to the syslog.
+
+This also works in a more limited way if boot was without slab debug.
+In that case ``slabinfo -v`` simply tests all reachable objects. Usually
+these are in the cpu slabs and the partial slabs. Full slabs are not
+tracked by SLUB in a non-debug situation.
+
+Getting more performance
+========================
+
+To some degree SLUB's performance is limited by the need to take the
+list_lock once in a while to deal with partial slabs. That overhead is
+governed by the order of the allocation for each slab. The allocations
+can be influenced by kernel parameters:
+
+.. slub_min_objects=x		(default 4)
+.. slub_min_order=x		(default 0)
+.. slub_max_order=x		(default 3 (PAGE_ALLOC_COSTLY_ORDER))
+
+``slub_min_objects``
+	allows you to specify how many objects must at least fit into one
+	slab in order for the allocation order to be acceptable. In
+	general slub will be able to perform this number of
+	allocations on a slab without consulting centralized resources
+	(list_lock) where contention may occur.
+
+``slub_min_order``
+	specifies a minimum order of slabs. It has a similar effect to
+	``slub_min_objects``.
+
+``slub_max_order``
+	specifies the order at which ``slub_min_objects`` should no
+	longer be checked. This is useful to avoid SLUB trying to
+	generate super large order pages to fit ``slub_min_objects``
+	of a slab cache with large object sizes into one high order
+	page. Setting the command line parameter
+	``debug_guardpage_minorder=N`` (N > 0) forces ``slub_max_order``
+	to 0, which causes slabs to be allocated at the minimum
+	possible order.
+
+SLUB Debug output
+=================
+
+Here is a sample of slub debug output::
+
+ ====================================================================
+ BUG kmalloc-8: Redzone overwritten
+ --------------------------------------------------------------------
+
+ INFO: 0xc90f6d28-0xc90f6d2b. First byte 0x00 instead of 0xcc
+ INFO: Slab 0xc528c530 flags=0x400000c3 inuse=61 fp=0xc90f6d58
+ INFO: Object 0xc90f6d20 @offset=3360 fp=0xc90f6d58
+ INFO: Allocated in get_modalias+0x61/0xf5 age=53 cpu=1 pid=554
+
+ Bytes b4 0xc90f6d10: 00 00 00 00 00 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a ........ZZZZZZZZ
+ Object 0xc90f6d20: 31 30 31 39 2e 30 30 35 1019.005
+ Redzone 0xc90f6d28: 00 cc cc cc .
+ Padding 0xc90f6d50: 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZ
+
+ [] dump_trace+0x63/0x1eb
+ [] show_trace_log_lvl+0x1a/0x2f
+ [] show_trace+0x12/0x14
+ [] dump_stack+0x16/0x18
+ [] object_err+0x143/0x14b
+ [] check_object+0x66/0x234
+ [] __slab_free+0x239/0x384
+ [] kfree+0xa6/0xc6
+ [] get_modalias+0xb9/0xf5
+ [] dmi_dev_uevent+0x27/0x3c
+ [] dev_uevent+0x1ad/0x1da
+ [] kobject_uevent_env+0x20a/0x45b
+ [] kobject_uevent+0xa/0xf
+ [] store_uevent+0x4f/0x58
+ [] dev_attr_store+0x29/0x2f
+ [] sysfs_write_file+0x16e/0x19c
+ [] vfs_write+0xd1/0x15a
+ [] sys_write+0x3d/0x72
+ [] sysenter_past_esp+0x5f/0x99
+ [] 0xb7f7b410
+ =======================
+
+ FIX kmalloc-8: Restoring Redzone 0xc90f6d28-0xc90f6d2b=0xcc
+
+If SLUB encounters a corrupted object (full detection requires the kernel
+to be booted with slub_debug) then the following output will be dumped
+into the syslog:
+
+1. Description of the problem encountered
+
+   This will be a message in the system log starting with::
+
+     ===============================================
+     BUG <slab cache affected>: <What went wrong>
+     -----------------------------------------------
+
+     INFO: <corruption start>-<corruption_end> <more info>
+     INFO: Slab <address> <slab information>
+     INFO: Object <address> <object information>
+     INFO: Allocated in <kernel function> age=<jiffies> cpu=<cpu> pid=<pid>
+     INFO: Freed in <kernel function> age=<jiffies> cpu=<cpu>
+	pid=<pid>
+
+   (Object allocation / free information is only available if SLAB_STORE_USER is
+   set for the slab. slub_debug sets that option)
+
+2. The object contents if an object was involved.
+
+   Various types of lines can follow the BUG SLUB line:
+
+   Bytes b4 <address> : <bytes>
+	Shows a few bytes before the object where the problem was detected.
+	Can be useful if the corruption does not stop with the start of the
+	object.
+
+   Object <address> : <bytes>
+	The bytes of the object. If the object is inactive then the bytes
+	typically contain poison values. Any non-poison value shows a
+	corruption by a write after free.
+
+   Redzone <address> : <bytes>
+	The Redzone following the object. The Redzone is used to detect
+	writes after the object. All bytes should always have the same
+	value. If there is any deviation then it is due to a write after
+	the object boundary.
+
+	(Redzone information is only available if SLAB_RED_ZONE is set.
+	slub_debug sets that option)
+
+   Padding <address> : <unused data>
+	Unused data to fill up the space in order to get the next object
+	properly aligned. In the debug case we make sure that there are
+	at least 4 bytes of padding. This allows the detection of writes
+	before the object.
+
+3. A stackdump
+
+   The stackdump describes the location where the error was detected. The cause
+   of the corruption is more likely to be found by looking at the function that
+   allocated or freed the object.
+
+4. Report on how the problem was dealt with in order to ensure the continued
+   operation of the system.
+
+   These are messages in the system log beginning with::
+
+     FIX <slab cache affected>: <corrective action taken>
+
+   In the above sample SLUB found that the Redzone of an active object has
+   been overwritten. Here a string of 8 characters was written into a slab
+   object that has a length of 8 characters. However, an 8 character string
+   needs a terminating 0. That zero has overwritten the first byte of the
+   Redzone field. After reporting the details of the issue encountered the
+   FIX SLUB message tells us that SLUB has restored the Redzone to its proper
+   value and then system operations continue.
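+The sample above is the classic off-by-one. A hedged kernel-style fragment
+(the function is made up; the string mirrors the report) of the kind of bug
+that produces exactly this output::
+
+	#include <linux/slab.h>
+	#include <linux/string.h>
+
+	static void buggy(void)
+	{
+		/* 8-byte object, but "1019.005" needs 9 bytes with its NUL. */
+		char *buf = kmalloc(8, GFP_KERNEL);
+
+		if (!buf)
+			return;
+		strcpy(buf, "1019.005");  /* the '\0' lands in the Redzone */
+		kfree(buf);               /* SLUB reports and repairs it here */
+	}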
+Emergency operations
+====================
+
+Minimal debugging (sanity checks alone) can be enabled by booting with::
+
+	slub_debug=F
+
+This will generally be enough to enable the resiliency features of slub
+which will keep the system running even if a bad kernel component keeps
+corrupting objects. This may be important for production systems.
+Performance will be impacted by the sanity checks and there will be a
+continual stream of error messages to the syslog but no additional memory
+will be used (unlike full debugging).
+
+No guarantees. The kernel component still needs to be fixed. Performance
+may be optimized further by locating the slab that experiences corruption
+and enabling debugging only for that cache.
+
+I.e.::
+
+	slub_debug=F,dentry
+
+If the corruption occurs by writing after the end of the object then it
+may be advisable to enable a Redzone to avoid corrupting the beginning
+of other objects::
+
+	slub_debug=FZ,dentry
+
+Extended slabinfo mode and plotting
+===================================
+
+The ``slabinfo`` tool has a special 'extended' ('-X') mode that includes:
+ - Slabcache Totals
+ - Slabs sorted by size (up to -N slabs, default 1)
+ - Slabs sorted by loss (up to -N slabs, default 1)
+
+Additionally, in this mode ``slabinfo`` does not dynamically scale
+sizes (G/M/K) and reports everything in bytes (this functionality is
+also available to other slabinfo modes via the '-B' option) which makes
+reporting more precise and accurate. Moreover, in some sense the '-X'
+mode also simplifies the analysis of slabs' behaviour, because its
+output can be plotted using the ``slabinfo-gnuplot.sh`` script. So it
+pushes the analysis from looking through the numbers (tons of numbers)
+to something easier -- visual analysis.
+
+To generate plots:
+
+a) collect slabinfo extended records, for example::
+
+	while [ 1 ]; do slabinfo -X >> FOO_STATS; sleep 1; done
+
+b) pass stats file(-s) to ``slabinfo-gnuplot.sh`` script::
+
+	slabinfo-gnuplot.sh FOO_STATS [FOO_STATS2 .. FOO_STATSN]
+
+   The ``slabinfo-gnuplot.sh`` script will pre-process the collected records
+   and generate 3 png files (and 3 pre-processing cache files) per STATS
+   file:
+   - Slabcache Totals: FOO_STATS-totals.png
+   - Slabs sorted by size: FOO_STATS-slabs-by-size.png
+   - Slabs sorted by loss: FOO_STATS-slabs-by-loss.png
+
+Another use case where ``slabinfo-gnuplot.sh`` can be useful is when you
+need to compare slabs' behaviour "prior to" and "after" some code
+modification. To help you out there, the ``slabinfo-gnuplot.sh`` script
+can 'merge' the `Slabcache Totals` sections from different
+measurements. To visually compare N plots:
+
+a) Collect as many STATS1, STATS2, .. STATSN files as you need::
+
+	while [ 1 ]; do slabinfo -X >> STATS; sleep 1; done
+
+b) Pre-process those STATS files::
+
+	slabinfo-gnuplot.sh STATS1 STATS2 .. STATSN
+
+c) Execute ``slabinfo-gnuplot.sh`` in '-t' mode, passing all of the
+   generated pre-processed \*-totals::
+
+	slabinfo-gnuplot.sh -t STATS1-totals STATS2-totals .. STATSN-totals
+
+   This will produce a single plot (png file).
+
+   Plots, expectedly, can be large so some fluctuations or small spikes
+   can go unnoticed. To deal with that, ``slabinfo-gnuplot.sh`` has two
+   options to 'zoom-in'/'zoom-out':
+
+   a) ``-s %d,%d`` -- overwrites the default image width and height
+   b) ``-r %d,%d`` -- specifies a range of samples to use (for example,
+      in ``slabinfo -X >> FOO_STATS; sleep 1;`` case, using a ``-r
+      40,60`` range will plot only samples collected between 40th and
+      60th seconds).
+
+Christoph Lameter, May 30, 2007
+Sergey Senozhatsky, October 23, 2015
diff --git a/Documentation/vm/slub.txt b/Documentation/vm/slub.txt
deleted file mode 100644
index 3a775fd64e2d..000000000000
--- a/Documentation/vm/slub.txt
+++ /dev/null
@@ -1,361 +0,0 @@
-.. _slub:
-
-==========================
-Short users guide for SLUB
-==========================
-
-The basic philosophy of SLUB is very different from SLAB. SLAB
-requires rebuilding the kernel to activate debug options for all
-slab caches. SLUB always includes full debugging but it is off by default.
-SLUB can enable debugging only for selected slabs in order to avoid
-an impact on overall system performance which may make a bug more
-difficult to find.
-
-In order to switch debugging on one can add an option ``slub_debug``
-to the kernel command line. That will enable full debugging for
-all slabs.
-
-Typically one would then use the ``slabinfo`` command to get statistical
-data and perform operation on the slabs. By default ``slabinfo`` only lists
-slabs that have data in them. See "slabinfo -h" for more options when
-running the command. ``slabinfo`` can be compiled with
-::
-
-	gcc -o slabinfo tools/vm/slabinfo.c
-
-Some of the modes of operation of ``slabinfo`` require that slub debugging
-be enabled on the command line. F.e. no tracking information will be
-available without debugging on and validation can only partially
-be performed if debugging was not switched on.
-
-Some more sophisticated uses of slub_debug:
--------------------------------------------
-
-Parameters may be given to ``slub_debug``. If none is specified then full
-debugging is enabled.
Format: - -slub_debug= - Enable options for all slabs -slub_debug=, - Enable options only for select slabs - - -Possible debug options are:: - - F Sanity checks on (enables SLAB_DEBUG_CONSISTENCY_CHECKS - Sorry SLAB legacy issues) - Z Red zoning - P Poisoning (object and padding) - U User tracking (free and alloc) - T Trace (please only use on single slabs) - A Toggle failslab filter mark for the cache - O Switch debugging off for caches that would have - caused higher minimum slab orders - - Switch all debugging off (useful if the kernel is - configured with CONFIG_SLUB_DEBUG_ON) - -F.e. in order to boot just with sanity checks and red zoning one would specify:: - - slub_debug=FZ - -Trying to find an issue in the dentry cache? Try:: - - slub_debug=,dentry - -to only enable debugging on the dentry cache. - -Red zoning and tracking may realign the slab. We can just apply sanity checks -to the dentry cache with:: - - slub_debug=F,dentry - -Debugging options may require the minimum possible slab order to increase as -a result of storing the metadata (for example, caches with PAGE_SIZE object -sizes). This has a higher liklihood of resulting in slab allocation errors -in low memory situations or if there's high fragmentation of memory. To -switch off debugging for such caches by default, use:: - - slub_debug=O - -In case you forgot to enable debugging on the kernel command line: It is -possible to enable debugging manually when the kernel is up. Look at the -contents of:: - - /sys/kernel/slab// - -Look at the writable files. Writing 1 to them will enable the -corresponding debug option. All options can be set on a slab that does -not contain objects. If the slab already contains objects then sanity checks -and tracing may only be enabled. The other options may cause the realignment -of objects. - -Careful with tracing: It may spew out lots of information and never stop if -used on the wrong slab. - -Slab merging -============ - -If no debug options are specified then SLUB may merge similar slabs together -in order to reduce overhead and increase cache hotness of objects. -``slabinfo -a`` displays which slabs were merged together. - -Slab validation -=============== - -SLUB can validate all object if the kernel was booted with slub_debug. In -order to do so you must have the ``slabinfo`` tool. Then you can do -:: - - slabinfo -v - -which will test all objects. Output will be generated to the syslog. - -This also works in a more limited way if boot was without slab debug. -In that case ``slabinfo -v`` simply tests all reachable objects. Usually -these are in the cpu slabs and the partial slabs. Full slabs are not -tracked by SLUB in a non debug situation. - -Getting more performance -======================== - -To some degree SLUB's performance is limited by the need to take the -list_lock once in a while to deal with partial slabs. That overhead is -governed by the order of the allocation for each slab. The allocations -can be influenced by kernel parameters: - -.. slub_min_objects=x (default 4) -.. slub_min_order=x (default 0) -.. slub_max_order=x (default 3 (PAGE_ALLOC_COSTLY_ORDER)) - -``slub_min_objects`` - allows to specify how many objects must at least fit into one - slab in order for the allocation order to be acceptable. In - general slub will be able to perform this number of - allocations on a slab without consulting centralized resources - (list_lock) where contention may occur. - -``slub_min_order`` - specifies a minim order of slabs. A similar effect like - ``slub_min_objects``. 
- -``slub_max_order`` - specified the order at which ``slub_min_objects`` should no - longer be checked. This is useful to avoid SLUB trying to - generate super large order pages to fit ``slub_min_objects`` - of a slab cache with large object sizes into one high order - page. Setting command line parameter - ``debug_guardpage_minorder=N`` (N > 0), forces setting - ``slub_max_order`` to 0, what cause minimum possible order of - slabs allocation. - -SLUB Debug output -================= - -Here is a sample of slub debug output:: - - ==================================================================== - BUG kmalloc-8: Redzone overwritten - -------------------------------------------------------------------- - - INFO: 0xc90f6d28-0xc90f6d2b. First byte 0x00 instead of 0xcc - INFO: Slab 0xc528c530 flags=0x400000c3 inuse=61 fp=0xc90f6d58 - INFO: Object 0xc90f6d20 @offset=3360 fp=0xc90f6d58 - INFO: Allocated in get_modalias+0x61/0xf5 age=53 cpu=1 pid=554 - - Bytes b4 0xc90f6d10: 00 00 00 00 00 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a ........ZZZZZZZZ - Object 0xc90f6d20: 31 30 31 39 2e 30 30 35 1019.005 - Redzone 0xc90f6d28: 00 cc cc cc . - Padding 0xc90f6d50: 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZ - - [] dump_trace+0x63/0x1eb - [] show_trace_log_lvl+0x1a/0x2f - [] show_trace+0x12/0x14 - [] dump_stack+0x16/0x18 - [] object_err+0x143/0x14b - [] check_object+0x66/0x234 - [] __slab_free+0x239/0x384 - [] kfree+0xa6/0xc6 - [] get_modalias+0xb9/0xf5 - [] dmi_dev_uevent+0x27/0x3c - [] dev_uevent+0x1ad/0x1da - [] kobject_uevent_env+0x20a/0x45b - [] kobject_uevent+0xa/0xf - [] store_uevent+0x4f/0x58 - [] dev_attr_store+0x29/0x2f - [] sysfs_write_file+0x16e/0x19c - [] vfs_write+0xd1/0x15a - [] sys_write+0x3d/0x72 - [] sysenter_past_esp+0x5f/0x99 - [] 0xb7f7b410 - ======================= - - FIX kmalloc-8: Restoring Redzone 0xc90f6d28-0xc90f6d2b=0xcc - -If SLUB encounters a corrupted object (full detection requires the kernel -to be booted with slub_debug) then the following output will be dumped -into the syslog: - -1. Description of the problem encountered - - This will be a message in the system log starting with:: - - =============================================== - BUG : - ----------------------------------------------- - - INFO: - - INFO: Slab
- INFO: Object
- INFO: Allocated in age= cpu= pid= - INFO: Freed in age= cpu= - pid= - - (Object allocation / free information is only available if SLAB_STORE_USER is - set for the slab. slub_debug sets that option) - -2. The object contents if an object was involved. - - Various types of lines can follow the BUG SLUB line: - - Bytes b4
: - Shows a few bytes before the object where the problem was detected. - Can be useful if the corruption does not stop with the start of the - object. - - Object
: - The bytes of the object. If the object is inactive then the bytes - typically contain poison values. Any non-poison value shows a - corruption by a write after free. - - Redzone
: - The Redzone following the object. The Redzone is used to detect - writes after the object. All bytes should always have the same - value. If there is any deviation then it is due to a write after - the object boundary. - - (Redzone information is only available if SLAB_RED_ZONE is set. - slub_debug sets that option) - - Padding
: - Unused data to fill up the space in order to get the next object - properly aligned. In the debug case we make sure that there are - at least 4 bytes of padding. This allows the detection of writes - before the object. - -3. A stackdump - - The stackdump describes the location where the error was detected. The cause - of the corruption is may be more likely found by looking at the function that - allocated or freed the object. - -4. Report on how the problem was dealt with in order to ensure the continued - operation of the system. - - These are messages in the system log beginning with:: - - FIX : - - In the above sample SLUB found that the Redzone of an active object has - been overwritten. Here a string of 8 characters was written into a slab that - has the length of 8 characters. However, a 8 character string needs a - terminating 0. That zero has overwritten the first byte of the Redzone field. - After reporting the details of the issue encountered the FIX SLUB message - tells us that SLUB has restored the Redzone to its proper value and then - system operations continue. - -Emergency operations -==================== - -Minimal debugging (sanity checks alone) can be enabled by booting with:: - - slub_debug=F - -This will be generally be enough to enable the resiliency features of slub -which will keep the system running even if a bad kernel component will -keep corrupting objects. This may be important for production systems. -Performance will be impacted by the sanity checks and there will be a -continual stream of error messages to the syslog but no additional memory -will be used (unlike full debugging). - -No guarantees. The kernel component still needs to be fixed. Performance -may be optimized further by locating the slab that experiences corruption -and enabling debugging only for that cache - -I.e.:: - - slub_debug=F,dentry - -If the corruption occurs by writing after the end of the object then it -may be advisable to enable a Redzone to avoid corrupting the beginning -of other objects:: - - slub_debug=FZ,dentry - -Extended slabinfo mode and plotting -=================================== - -The ``slabinfo`` tool has a special 'extended' ('-X') mode that includes: - - Slabcache Totals - - Slabs sorted by size (up to -N slabs, default 1) - - Slabs sorted by loss (up to -N slabs, default 1) - -Additionally, in this mode ``slabinfo`` does not dynamically scale -sizes (G/M/K) and reports everything in bytes (this functionality is -also available to other slabinfo modes via '-B' option) which makes -reporting more precise and accurate. Moreover, in some sense the `-X' -mode also simplifies the analysis of slabs' behaviour, because its -output can be plotted using the ``slabinfo-gnuplot.sh`` script. So it -pushes the analysis from looking through the numbers (tons of numbers) -to something easier -- visual analysis. - -To generate plots: - -a) collect slabinfo extended records, for example:: - - while [ 1 ]; do slabinfo -X >> FOO_STATS; sleep 1; done - -b) pass stats file(-s) to ``slabinfo-gnuplot.sh`` script:: - - slabinfo-gnuplot.sh FOO_STATS [FOO_STATS2 .. 
-	slabinfo-gnuplot.sh FOO_STATS [FOO_STATS2 .. FOO_STATSN]
-
-   The ``slabinfo-gnuplot.sh`` script will pre-processes the collected records
-   and generates 3 png files (and 3 pre-processing cache files) per STATS
-   file:
-   - Slabcache Totals: FOO_STATS-totals.png
-   - Slabs sorted by size: FOO_STATS-slabs-by-size.png
-   - Slabs sorted by loss: FOO_STATS-slabs-by-loss.png
-
-Another use case, when ``slabinfo-gnuplot.sh`` can be useful, is when you
-need to compare slabs' behaviour "prior to" and "after" some code
-modification. To help you out there, ``slabinfo-gnuplot.sh`` script
-can 'merge' the `Slabcache Totals` sections from different
-measurements. To visually compare N plots:
-
-a) Collect as many STATS1, STATS2, .. STATSN files as you need::
-
-	while [ 1 ]; do slabinfo -X >> STATS; sleep 1; done
-
-b) Pre-process those STATS files::
-
-	slabinfo-gnuplot.sh STATS1 STATS2 .. STATSN
-
-c) Execute ``slabinfo-gnuplot.sh`` in '-t' mode, passing all of the
-   generated pre-processed \*-totals::
-
-	slabinfo-gnuplot.sh -t STATS1-totals STATS2-totals .. STATSN-totals
-
-   This will produce a single plot (png file).
-
-   Plots, expectedly, can be large so some fluctuations or small spikes
-   can go unnoticed. To deal with that, ``slabinfo-gnuplot.sh`` has two
-   options to 'zoom-in'/'zoom-out':
-
-   a) ``-s %d,%d`` -- overwrites the default image width and heigh
-   b) ``-r %d,%d`` -- specifies a range of samples to use (for example,
-      in ``slabinfo -X >> FOO_STATS; sleep 1;`` case, using a ``-r
-      40,60`` range will plot only samples collected between 40th and
-      60th seconds).
-
-Christoph Lameter, May 30, 2007
-Sergey Senozhatsky, October 23, 2015
diff --git a/Documentation/vm/soft-dirty.rst b/Documentation/vm/soft-dirty.rst
new file mode 100644
index 000000000000..cb0cfd6672fa
--- /dev/null
+++ b/Documentation/vm/soft-dirty.rst
@@ -0,0 +1,47 @@
+.. _soft_dirty:
+
+===============
+Soft-Dirty PTEs
+===============
+
+Soft-dirty is a bit on a PTE which helps to track which pages a task
+writes to. In order to do this tracking one should
+
+ 1. Clear soft-dirty bits from the task's PTEs.
+
+    This is done by writing "4" into the ``/proc/PID/clear_refs`` file of the
+    task in question.
+
+ 2. Wait some time.
+
+ 3. Read soft-dirty bits from the PTEs.
+
+    This is done by reading from ``/proc/PID/pagemap``. Bit 55 of each
+    64-bit qword is the soft-dirty one. If set, the respective PTE was
+    written to since step 1.
+
+
+Internally, to do this tracking, the writable bit is cleared from PTEs
+when the soft-dirty bit is cleared. So, after this, when the task tries to
+modify a page at some virtual address the #PF occurs and the kernel sets
+the soft-dirty bit on the respective PTE.
+
+Note that although all the task's address space is marked as r/o after the
+soft-dirty bits are cleared, the #PF-s that occur after that are processed
+quickly. This is because the pages are still mapped to physical memory, and
+thus all the kernel does is find this out and put both the writable and
+soft-dirty bits on the PTE.
+
+While in most cases tracking memory changes by #PF-s is more than enough
+there is still a scenario where we can lose soft-dirty bits -- a task
+unmaps a previously mapped memory region and then maps a new one at exactly
+the same place. When unmap is called, the kernel internally clears PTE values
+including soft-dirty bits. To notify user space applications about such
+memory region renewals the kernel always marks new memory regions (and
+expanded regions) as soft dirty.
+
+This feature is actively used by the checkpoint-restore project.
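+For concreteness, a hedged userspace sketch of steps 1 and 3 for a process
+inspecting one of its own pages (paths and the bit number are exactly those
+documented above; error handling is trimmed)::
+
+	#define _GNU_SOURCE
+	#include <fcntl.h>
+	#include <stdint.h>
+	#include <unistd.h>
+
+	/* Step 1: clear soft-dirty bits for the whole task. */
+	static int clear_soft_dirty(void)
+	{
+		int fd = open("/proc/self/clear_refs", O_WRONLY);
+		int ret = (fd >= 0 && write(fd, "4", 1) == 1) ? 0 : -1;
+
+		if (fd >= 0)
+			close(fd);
+		return ret;
+	}
+
+	/* Step 3: read bit 55 of the pagemap entry covering 'addr'. */
+	static int page_soft_dirty(void *addr)
+	{
+		long psize = sysconf(_SC_PAGESIZE);
+		uint64_t ent = 0;
+		int fd = open("/proc/self/pagemap", O_RDONLY);
+
+		if (fd < 0)
+			return -1;
+		pread(fd, &ent, sizeof(ent),
+		      (uintptr_t)addr / psize * sizeof(ent));
+		close(fd);
+		return (ent >> 55) & 1;
+	}
+
+CRIU, the checkpoint-restore tool referenced above, drives the same two
+files at much larger scale.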
You +can find more details about it on http://criu.org + + +-- Pavel Emelyanov, Apr 9, 2013 diff --git a/Documentation/vm/soft-dirty.txt b/Documentation/vm/soft-dirty.txt deleted file mode 100644 index cb0cfd6672fa..000000000000 --- a/Documentation/vm/soft-dirty.txt +++ /dev/null @@ -1,47 +0,0 @@ -.. _soft_dirty: - -=============== -Soft-Dirty PTEs -=============== - -The soft-dirty is a bit on a PTE which helps to track which pages a task -writes to. In order to do this tracking one should - - 1. Clear soft-dirty bits from the task's PTEs. - - This is done by writing "4" into the ``/proc/PID/clear_refs`` file of the - task in question. - - 2. Wait some time. - - 3. Read soft-dirty bits from the PTEs. - - This is done by reading from the ``/proc/PID/pagemap``. The bit 55 of the - 64-bit qword is the soft-dirty one. If set, the respective PTE was - written to since step 1. - - -Internally, to do this tracking, the writable bit is cleared from PTEs -when the soft-dirty bit is cleared. So, after this, when the task tries to -modify a page at some virtual address the #PF occurs and the kernel sets -the soft-dirty bit on the respective PTE. - -Note, that although all the task's address space is marked as r/o after the -soft-dirty bits clear, the #PF-s that occur after that are processed fast. -This is so, since the pages are still mapped to physical memory, and thus all -the kernel does is finds this fact out and puts both writable and soft-dirty -bits on the PTE. - -While in most cases tracking memory changes by #PF-s is more than enough -there is still a scenario when we can lose soft dirty bits -- a task -unmaps a previously mapped memory region and then maps a new one at exactly -the same place. When unmap is called, the kernel internally clears PTE values -including soft dirty bits. To notify user space application about such -memory region renewal the kernel always marks new memory regions (and -expanded regions) as soft dirty. - -This feature is actively used by the checkpoint-restore project. You -can find more details about it on http://criu.org - - --- Pavel Emelyanov, Apr 9, 2013 diff --git a/Documentation/vm/split_page_table_lock b/Documentation/vm/split_page_table_lock deleted file mode 100644 index 889b00be469f..000000000000 --- a/Documentation/vm/split_page_table_lock +++ /dev/null @@ -1,100 +0,0 @@ -.. _split_page_table_lock: - -===================== -Split page table lock -===================== - -Originally, mm->page_table_lock spinlock protected all page tables of the -mm_struct. But this approach leads to poor page fault scalability of -multi-threaded applications due high contention on the lock. To improve -scalability, split page table lock was introduced. - -With split page table lock we have separate per-table lock to serialize -access to the table. At the moment we use split lock for PTE and PMD -tables. Access to higher level tables protected by mm->page_table_lock. 
- -There are helpers to lock/unlock a table and other accessor functions: - - - pte_offset_map_lock() - maps pte and takes PTE table lock, returns pointer to the taken - lock; - - pte_unmap_unlock() - unlocks and unmaps PTE table; - - pte_alloc_map_lock() - allocates PTE table if needed and take the lock, returns pointer - to taken lock or NULL if allocation failed; - - pte_lockptr() - returns pointer to PTE table lock; - - pmd_lock() - takes PMD table lock, returns pointer to taken lock; - - pmd_lockptr() - returns pointer to PMD table lock; - -Split page table lock for PTE tables is enabled compile-time if -CONFIG_SPLIT_PTLOCK_CPUS (usually 4) is less or equal to NR_CPUS. -If split lock is disabled, all tables guaded by mm->page_table_lock. - -Split page table lock for PMD tables is enabled, if it's enabled for PTE -tables and the architecture supports it (see below). - -Hugetlb and split page table lock -================================= - -Hugetlb can support several page sizes. We use split lock only for PMD -level, but not for PUD. - -Hugetlb-specific helpers: - - - huge_pte_lock() - takes pmd split lock for PMD_SIZE page, mm->page_table_lock - otherwise; - - huge_pte_lockptr() - returns pointer to table lock; - -Support of split page table lock by an architecture -=================================================== - -There's no need in special enabling of PTE split page table lock: -everything required is done by pgtable_page_ctor() and pgtable_page_dtor(), -which must be called on PTE table allocation / freeing. - -Make sure the architecture doesn't use slab allocator for page table -allocation: slab uses page->slab_cache for its pages. -This field shares storage with page->ptl. - -PMD split lock only makes sense if you have more than two page table -levels. - -PMD split lock enabling requires pgtable_pmd_page_ctor() call on PMD table -allocation and pgtable_pmd_page_dtor() on freeing. - -Allocation usually happens in pmd_alloc_one(), freeing in pmd_free() and -pmd_free_tlb(), but make sure you cover all PMD table allocation / freeing -paths: i.e X86_PAE preallocate few PMDs on pgd_alloc(). - -With everything in place you can set CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK. - -NOTE: pgtable_page_ctor() and pgtable_pmd_page_ctor() can fail -- it must -be handled properly. - -page->ptl -========= - -page->ptl is used to access split page table lock, where 'page' is struct -page of page containing the table. It shares storage with page->private -(and few other fields in union). - -To avoid increasing size of struct page and have best performance, we use a -trick: - - - if spinlock_t fits into long, we use page->ptr as spinlock, so we - can avoid indirect access and save a cache line. - - if size of spinlock_t is bigger then size of long, we use page->ptl as - pointer to spinlock_t and allocate it dynamically. This allows to use - split lock with enabled DEBUG_SPINLOCK or DEBUG_LOCK_ALLOC, but costs - one more cache line for indirect access; - -The spinlock_t allocated in pgtable_page_ctor() for PTE table and in -pgtable_pmd_page_ctor() for PMD table. - -Please, never access page->ptl directly -- use appropriate helper. diff --git a/Documentation/vm/split_page_table_lock.rst b/Documentation/vm/split_page_table_lock.rst new file mode 100644 index 000000000000..889b00be469f --- /dev/null +++ b/Documentation/vm/split_page_table_lock.rst @@ -0,0 +1,100 @@ +.. 
+_split_page_table_lock:
+
+=====================
+Split page table lock
+=====================
+
+Originally, the mm->page_table_lock spinlock protected all page tables of the
+mm_struct. But this approach leads to poor page fault scalability of
+multi-threaded applications due to high contention on the lock. To improve
+scalability, split page table lock was introduced.
+
+With split page table lock we have a separate per-table lock to serialize
+access to the table. At the moment we use split lock for PTE and PMD
+tables. Access to higher level tables is protected by mm->page_table_lock.
+
+There are helpers to lock/unlock a table and other accessor functions:
+
+ - pte_offset_map_lock()
+	maps pte and takes PTE table lock, returns pointer to the taken
+	lock;
+ - pte_unmap_unlock()
+	unlocks and unmaps PTE table;
+ - pte_alloc_map_lock()
+	allocates PTE table if needed and takes the lock, returns pointer
+	to taken lock or NULL if allocation failed;
+ - pte_lockptr()
+	returns pointer to PTE table lock;
+ - pmd_lock()
+	takes PMD table lock, returns pointer to taken lock;
+ - pmd_lockptr()
+	returns pointer to PMD table lock;
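+A hedged sketch of how the PTE helpers are typically used inside a
+page-table walk (a fragment, not a complete function; assumes ``pmd`` points
+at a PTE table and ``mm``/``addr`` are in scope)::
+
+	spinlock_t *ptl;
+	pte_t *pte;
+
+	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);	/* map + lock */
+	if (pte_present(*pte)) {
+		/* inspect or update the entry under the per-table lock */
+	}
+	pte_unmap_unlock(pte, ptl);			/* unlock + unmap */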
+Split page table lock for PTE tables is enabled compile-time if
+CONFIG_SPLIT_PTLOCK_CPUS (usually 4) is less than or equal to NR_CPUS.
+If split lock is disabled, all tables are guarded by mm->page_table_lock.
+
+Split page table lock for PMD tables is enabled, if it's enabled for PTE
+tables and the architecture supports it (see below).
+
+Hugetlb and split page table lock
+=================================
+
+Hugetlb can support several page sizes. We use split lock only for PMD
+level, but not for PUD.
+
+Hugetlb-specific helpers:
+
+ - huge_pte_lock()
+	takes pmd split lock for PMD_SIZE page, mm->page_table_lock
+	otherwise;
+ - huge_pte_lockptr()
+	returns pointer to table lock;
+
+Support of split page table lock by an architecture
+===================================================
+
+There's no need to specially enable PTE split page table lock:
+everything required is done by pgtable_page_ctor() and pgtable_page_dtor(),
+which must be called on PTE table allocation / freeing.
+
+Make sure the architecture doesn't use the slab allocator for page table
+allocation: slab uses page->slab_cache for its pages.
+This field shares storage with page->ptl.
+
+PMD split lock only makes sense if you have more than two page table
+levels.
+
+PMD split lock enabling requires a pgtable_pmd_page_ctor() call on PMD table
+allocation and pgtable_pmd_page_dtor() on freeing.
+
+Allocation usually happens in pmd_alloc_one(), freeing in pmd_free() and
+pmd_free_tlb(), but make sure you cover all PMD table allocation / freeing
+paths: e.g., X86_PAE preallocates a few PMDs on pgd_alloc().
+
+With everything in place you can set CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK.
+
+NOTE: pgtable_page_ctor() and pgtable_pmd_page_ctor() can fail -- this must
+be handled properly.
+
+page->ptl
+=========
+
+page->ptl is used to access the split page table lock, where 'page' is the
+struct page of the page containing the table. It shares storage with
+page->private (and a few other fields in the union).
+
+To avoid increasing the size of struct page and to get the best performance,
+we use a trick:
+
+ - if spinlock_t fits into long, we use page->ptl as a spinlock, so we
+   can avoid indirect access and save a cache line.
+ - if the size of spinlock_t is bigger than the size of long, we use
+   page->ptl as a pointer to spinlock_t and allocate it dynamically. This
+   allows using the split lock with DEBUG_SPINLOCK or DEBUG_LOCK_ALLOC
+   enabled, but costs one more cache line for the indirect access;
+
+The spinlock_t is allocated in pgtable_page_ctor() for PTE tables and in
+pgtable_pmd_page_ctor() for PMD tables.
+
+Please never access page->ptl directly -- use the appropriate helper.
diff --git a/Documentation/vm/swap_numa.rst b/Documentation/vm/swap_numa.rst
new file mode 100644
index 000000000000..e0466f2db8fa
--- /dev/null
+++ b/Documentation/vm/swap_numa.rst
@@ -0,0 +1,80 @@
+.. _swap_numa:
+
+===========================================
+Automatically bind swap device to numa node
+===========================================
+
+If the system has more than one swap device and the swap devices have node
+information, we can make use of this information to decide which swap
+device to use in get_swap_pages() to get better performance.
+
+
+How to use this feature
+=======================
+
+Each swap device has a priority, which decides the order in which it is
+used. To make use of automatic binding, there is no need to manipulate
+priority settings for swap devices. e.g. on a 2 node machine, assume 2 swap
+devices swapA and swapB, with swapA attached to node 0 and swapB attached
+to node 1, are going to be swapped on. Simply swap them on::
+
+	# swapon /dev/swapA
+	# swapon /dev/swapB
+
+Then node 0 will use the two swap devices in the order of swapA then swapB and
+node 1 will use the two swap devices in the order of swapB then swapA. Note
+that the order in which they are swapped on doesn't matter.
+
+A more complex example on a 4 node machine. Assume 6 swap devices are going to
+be swapped on: swapA and swapB are attached to node 0, swapC is attached to
+node 1, swapD and swapE are attached to node 2 and swapF is attached to node 3.
+The way to swap them on is the same as above::
+
+	# swapon /dev/swapA
+	# swapon /dev/swapB
+	# swapon /dev/swapC
+	# swapon /dev/swapD
+	# swapon /dev/swapE
+	# swapon /dev/swapF
+
+Then node 0 will use them in the order of::
+
+	swapA/swapB -> swapC -> swapD -> swapE -> swapF
+
+swapA and swapB will be used in a round robin mode before any other swap device.
+
+node 1 will use them in the order of::
+
+	swapC -> swapA -> swapB -> swapD -> swapE -> swapF
+
+node 2 will use them in the order of::
+
+	swapD/swapE -> swapA -> swapB -> swapC -> swapF
+
+Similarly, swapD and swapE will be used in a round robin mode before any
+other swap devices.
+
+node 3 will use them in the order of::
+
+	swapF -> swapA -> swapB -> swapC -> swapD -> swapE
+
+
+Implementation details
+======================
+
+The current code uses a priority based list, swap_avail_list, to decide
+which swap device to use; if multiple swap devices share the same
+priority, they are used round robin. This change replaces the single
+global swap_avail_list with a per-numa-node list, i.e., each numa node
+sees its own priority-based list of available swap devices. A swap
+device's priority can be promoted on its matching node's swap_avail_list.
+
+A swap device's priority is set as follows: the user can set a value >= 0,
+or the system will pick one starting from -1 and going downwards. The
+priority value in the swap_avail_list is the negated value of the swap
+device's priority, due to plist being sorted from low to high. The new
+policy doesn't change the semantics for the priority >= 0 cases; the
+previous "starting from -1 then downwards" now becomes "starting from -2
+then downwards", and -1 is reserved as the promoted value.
So if multiple swap devices are attached to the same +node, they will all be promoted to priority -1 on that node's plist and will +be used round robin before any other swap devices. diff --git a/Documentation/vm/swap_numa.txt b/Documentation/vm/swap_numa.txt deleted file mode 100644 index e0466f2db8fa..000000000000 --- a/Documentation/vm/swap_numa.txt +++ /dev/null @@ -1,80 +0,0 @@ -.. _swap_numa: - -=========================================== -Automatically bind swap device to numa node -=========================================== - -If the system has more than one swap device and swap device has the node -information, we can make use of this information to decide which swap -device to use in get_swap_pages() to get better performance. - - -How to use this feature -======================= - -Swap device has priority and that decides the order of it to be used. To make -use of automatically binding, there is no need to manipulate priority settings -for swap devices. e.g. on a 2 node machine, assume 2 swap devices swapA and -swapB, with swapA attached to node 0 and swapB attached to node 1, are going -to be swapped on. Simply swapping them on by doing:: - - # swapon /dev/swapA - # swapon /dev/swapB - -Then node 0 will use the two swap devices in the order of swapA then swapB and -node 1 will use the two swap devices in the order of swapB then swapA. Note -that the order of them being swapped on doesn't matter. - -A more complex example on a 4 node machine. Assume 6 swap devices are going to -be swapped on: swapA and swapB are attached to node 0, swapC is attached to -node 1, swapD and swapE are attached to node 2 and swapF is attached to node3. -The way to swap them on is the same as above:: - - # swapon /dev/swapA - # swapon /dev/swapB - # swapon /dev/swapC - # swapon /dev/swapD - # swapon /dev/swapE - # swapon /dev/swapF - -Then node 0 will use them in the order of:: - - swapA/swapB -> swapC -> swapD -> swapE -> swapF - -swapA and swapB will be used in a round robin mode before any other swap device. - -node 1 will use them in the order of:: - - swapC -> swapA -> swapB -> swapD -> swapE -> swapF - -node 2 will use them in the order of:: - - swapD/swapE -> swapA -> swapB -> swapC -> swapF - -Similaly, swapD and swapE will be used in a round robin mode before any -other swap devices. - -node 3 will use them in the order of:: - - swapF -> swapA -> swapB -> swapC -> swapD -> swapE - - -Implementation details -====================== - -The current code uses a priority based list, swap_avail_list, to decide -which swap device to use and if multiple swap devices share the same -priority, they are used round robin. This change here replaces the single -global swap_avail_list with a per-numa-node list, i.e. for each numa node, -it sees its own priority based list of available swap devices. Swap -device's priority can be promoted on its matching node's swap_avail_list. - -The current swap device's priority is set as: user can set a >=0 value, -or the system will pick one starting from -1 then downwards. The priority -value in the swap_avail_list is the negated value of the swap device's -due to plist being sorted from low to high. The new policy doesn't change -the semantics for priority >=0 cases, the previous starting from -1 then -downwards now becomes starting from -2 then downwards and -1 is reserved -as the promoted value. 
So if multiple swap devices are attached to the same
-node, they will all be promoted to priority -1 on that node's plist and will
-be used round robin before any other swap devices.
diff --git a/Documentation/vm/transhuge.rst b/Documentation/vm/transhuge.rst
new file mode 100644
index 000000000000..569d182cc973
--- /dev/null
+++ b/Documentation/vm/transhuge.rst
@@ -0,0 +1,573 @@
+.. _transhuge:
+
+============================
+Transparent Hugepage Support
+============================
+
+Objective
+=========
+
+Performance critical computing applications dealing with large memory
+working sets are already running on top of libhugetlbfs and in turn
+hugetlbfs. Transparent Hugepage Support is an alternative means of using
+huge pages for the backing of virtual memory, one that supports the
+automatic promotion and demotion of page sizes and avoids the
+shortcomings of hugetlbfs.
+
+Currently it only works for anonymous memory mappings and tmpfs/shmem.
+But in the future it can expand to other filesystems.
+
+Applications run faster because of two factors. The first factor consists
+in taking a single page fault for each 2M virtual region touched by
+userland (thus reducing the enter/exit kernel frequency by a factor of
+512). It is almost completely irrelevant and not of significant interest,
+because it only matters the first time the memory is accessed for the
+lifetime of a memory mapping, and it also has the downside of requiring
+larger clear-page/copy-page operations in page faults, which is a
+potentially negative effect. The second, long lasting and much more
+important factor affects all subsequent accesses to the memory for the
+whole runtime of the application. It consists of two components: 1) the
+TLB miss will run faster (especially with virtualization using nested
+pagetables but almost always also on bare metal without virtualization)
+and 2) a single TLB entry will be mapping a much larger amount of virtual
+memory, in turn reducing the number of TLB misses. With virtualization and
+nested pagetables, TLB entries of larger size can be used only if both KVM
+and the Linux guest are using hugepages, but a significant speedup already
+happens if only one of the two is using hugepages, just because the TLB
+miss runs faster.
+
+Design
+======
+
+- "graceful fallback": mm components which don't have transparent hugepage
+  knowledge fall back to breaking huge pmd mapping into table of ptes and,
+  if necessary, split a transparent hugepage. Therefore these components
+  can continue working on the regular pages or regular pte mappings.
+
+- if a hugepage allocation fails because of memory fragmentation,
+  regular pages should be gracefully allocated instead and mixed in
+  the same vma without any failure or significant delay and without
+  userland noticing
+
+- if some task quits and more hugepages become available (either
+  immediately in the buddy or through the VM), guest physical memory
+  backed by regular pages should be relocated on hugepages
+  automatically (with khugepaged)
+
+- it doesn't require memory reservation and in turn it uses hugepages
+  whenever possible (the only possible reservation here is kernelcore=
+  to prevent unmovable pages from fragmenting all the memory, but such a
+  tweak is not specific to transparent hugepage support and it's a
+  generic feature that applies to all dynamic high order allocations in
+  the kernel)
+
+Transparent Hugepage Support maximizes the usefulness of free memory
+if compared to the reservation approach of hugetlbfs by allowing all
+unused memory to be used as cache or other movable (or even unmovable)
+entities. It doesn't require reservation to prevent hugepage
+allocation failures from being noticeable from userland. It allows paging
+and all other advanced VM features to be available on the
+hugepages. It requires no modifications for applications to take
+advantage of it.
+
+Applications however can be further optimized to take advantage of
+this feature, just as they have been optimized before to avoid
+a flood of mmap system calls for every malloc(4k). Optimizing userland
+is by no means mandatory and khugepaged already can take care of long
+lived page allocations even for hugepage unaware applications that deal
+with large amounts of memory.
+
+In certain cases when hugepages are enabled system wide, applications
+may end up allocating more memory resources. An application may mmap a
+large region but only touch 1 byte of it; in that case a 2M page might
+be allocated instead of a 4k page for no good reason. This is why it's
+possible to disable hugepages system-wide and to only have them inside
+MADV_HUGEPAGE madvise regions.
+
+Embedded systems should enable hugepages only inside madvise regions
+to eliminate any risk of wasting any precious byte of memory and to
+only run faster.
+
+Applications that get a lot of benefit from hugepages and that don't
+risk losing memory by using hugepages should use
+madvise(MADV_HUGEPAGE) on their critical mmapped regions.
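+A hedged sketch of that advice (the helper name is made up, and the 2M
+constant assumes x86-64; a portable program would read hpage_pmd_size,
+shown in the sysfs section below)::
+
+	#define _DEFAULT_SOURCE
+	#include <stdlib.h>
+	#include <sys/mman.h>
+
+	#define HPAGE_SIZE (2UL << 20)	/* assumption: PMD page size is 2M */
+
+	static void *thp_alloc(size_t bytes)
+	{
+		void *p;
+
+		/* Natural alignment lets the first fault map a huge page. */
+		if (posix_memalign(&p, HPAGE_SIZE, bytes))
+			return NULL;
+		/* Advisory: enables THP for this range even in madvise mode. */
+		madvise(p, bytes, MADV_HUGEPAGE);
+		return p;
+	}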
+sysfs
+=====
+
+Transparent Hugepage Support for anonymous memory can be entirely disabled
+(mostly for debugging purposes) or only enabled inside MADV_HUGEPAGE
+regions (to avoid the risk of consuming more memory resources) or enabled
+system wide. This can be achieved with one of::
+
+	echo always >/sys/kernel/mm/transparent_hugepage/enabled
+	echo madvise >/sys/kernel/mm/transparent_hugepage/enabled
+	echo never >/sys/kernel/mm/transparent_hugepage/enabled
+
+It's also possible to limit defrag efforts in the VM to generate
+anonymous hugepages, in case they're not immediately free, to madvise
+regions, or to never try to defrag memory and simply fall back to regular
+pages unless hugepages are immediately available. Clearly if we spend CPU
+time to defrag memory, we would expect to gain even more by the fact we
+use hugepages later instead of regular pages. This isn't always
+guaranteed, but it may be more likely in case the allocation is for a
+MADV_HUGEPAGE region.
+
+::
+
+	echo always >/sys/kernel/mm/transparent_hugepage/defrag
+	echo defer >/sys/kernel/mm/transparent_hugepage/defrag
+	echo defer+madvise >/sys/kernel/mm/transparent_hugepage/defrag
+	echo madvise >/sys/kernel/mm/transparent_hugepage/defrag
+	echo never >/sys/kernel/mm/transparent_hugepage/defrag
+
+always
+	means that an application requesting THP will stall on
+	allocation failure and directly reclaim pages and compact
+	memory in an effort to allocate a THP immediately. This may be
+	desirable for virtual machines that benefit heavily from THP
+	use and are willing to delay the VM start to utilise them.
+
+defer
+	means that an application will wake kswapd in the background
+	to reclaim pages and wake kcompactd to compact memory so that
+	THP is available in the near future. It's the responsibility
+	of khugepaged to then install the THP pages later.
+
+defer+madvise
+	will enter direct reclaim and compaction like ``always``, but
+	only for regions that have used madvise(MADV_HUGEPAGE); all
+	other regions will wake kswapd in the background to reclaim
+	pages and wake kcompactd to compact memory so that THP is
+	available in the near future.
+
+madvise
+	will enter direct reclaim like ``always`` but only for regions
+	that have used madvise(MADV_HUGEPAGE). This is the default
+	behaviour.
+
+never
+	should be self-explanatory.
+
+By default the kernel tries to use the huge zero page on read page faults
+to anonymous mappings. It's possible to disable the huge zero page by
+writing 0 or enable it back by writing 1::
+
+	echo 0 >/sys/kernel/mm/transparent_hugepage/use_zero_page
+	echo 1 >/sys/kernel/mm/transparent_hugepage/use_zero_page
+
+Some userspace (such as a test program, or an optimized memory allocation
+library) may want to know the size (in bytes) of a transparent hugepage::
+
+	cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size
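+A minimal sketch of reading that knob from a program (no assumptions beyond
+the sysfs file named above)::
+
+	#include <stdio.h>
+
+	static unsigned long thp_pmd_size(void)
+	{
+		unsigned long sz = 0;
+		FILE *f = fopen("/sys/kernel/mm/transparent_hugepage/"
+				"hpage_pmd_size", "r");
+
+		if (f) {
+			if (fscanf(f, "%lu", &sz) != 1)
+				sz = 0;
+			fclose(f);
+		}
+		return sz;	/* e.g. 2097152 on x86-64; 0 if unavailable */
+	}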
+khugepaged will be automatically started when
+transparent_hugepage/enabled is set to "always" or "madvise", and it'll
+be automatically shut down if it's set to "never".
+
+khugepaged usually runs at low frequency so while one may not want to
+invoke defrag algorithms synchronously during the page faults, it
+should be worth invoking defrag at least in khugepaged. However it's
+also possible to disable defrag in khugepaged by writing 0 or enable
+defrag in khugepaged by writing 1::
+
+	echo 0 >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag
+	echo 1 >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag
+
+You can also control how many pages khugepaged should scan at each
+pass::
+
+	/sys/kernel/mm/transparent_hugepage/khugepaged/pages_to_scan
+
+and how many milliseconds to wait in khugepaged between each pass (you
+can set this to 0 to run khugepaged at 100% utilization of one core)::
+
+	/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs
+
+and how many milliseconds to wait in khugepaged if there's a hugepage
+allocation failure to throttle the next allocation attempt::
+
+	/sys/kernel/mm/transparent_hugepage/khugepaged/alloc_sleep_millisecs
+
+The khugepaged progress can be seen in the number of pages collapsed::
+
+	/sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed
+
+for each pass::
+
+	/sys/kernel/mm/transparent_hugepage/khugepaged/full_scans
+
+``max_ptes_none`` specifies how many extra small pages (that are
+not already mapped) can be allocated when collapsing a group
+of small pages into one large page::
+
+	/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none
+
+A higher value leads to programs using additional memory; a lower value
+leads to less THP performance gain. The effect of max_ptes_none on CPU
+time is negligible, so you can ignore it in that respect.
+
+``max_ptes_swap`` specifies how many pages can be brought in from
+swap when collapsing a group of pages into a transparent huge page::
+
+	/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_swap
+
+A higher value can cause excessive swap IO and waste
+memory. A lower value can prevent THPs from being
+collapsed, resulting in fewer pages being collapsed into
+THPs, and lower memory access performance.
+Boot parameter
+==============
+
+You can change the sysfs boot time defaults of Transparent Hugepage
+Support by passing the parameter ``transparent_hugepage=always`` or
+``transparent_hugepage=madvise`` or ``transparent_hugepage=never``
+to the kernel command line.
+
+Hugepages in tmpfs/shmem
+========================
+
+You can control hugepage allocation policy in tmpfs with the mount option
+``huge=``. It can have the following values:
+
+always
+	Attempt to allocate huge pages every time we need a new page;
+
+never
+	Do not allocate huge pages;
+
+within_size
+	Only allocate huge page if it will be fully within i_size.
+	Also respect fadvise()/madvise() hints;
+
+advise
+	Only allocate huge pages if requested with fadvise()/madvise();
+
+The default policy is ``never``.
+
+``mount -o remount,huge= /mountpoint`` works fine after mount: remounting
+``huge=never`` will not attempt to break up huge pages at all, just stop more
+from being allocated.
+
+There's also a sysfs knob to control hugepage allocation policy for the
+internal shmem mount: /sys/kernel/mm/transparent_hugepage/shmem_enabled. The
+mount is used for SysV SHM, memfds, shared anonymous mmaps (of /dev/zero or
+MAP_ANONYMOUS), GPU drivers' DRM objects, Ashmem.
+
+In addition to the policies listed above, shmem_enabled allows two further
+values:
+
+deny
+	For use in emergencies, to force the huge option off from
+	all mounts;
+force
+	Force the huge option on for all - very useful for testing;
+
+Need of application restart
+===========================
+
+The transparent_hugepage/enabled values and tmpfs mount option only affect
+future behavior. So to make them effective you need to restart any
+application that could have been using hugepages. This also applies to the
+regions registered in khugepaged.
+
+Monitoring usage
+================
+
+The number of anonymous transparent huge pages currently used by the
+system is available by reading the AnonHugePages field in ``/proc/meminfo``.
+To identify what applications are using anonymous transparent huge pages,
+it is necessary to read ``/proc/PID/smaps`` and count the AnonHugePages fields
+for each mapping.
+
+The number of file transparent huge pages mapped to userspace is available
+by reading ShmemPmdMapped and ShmemHugePages fields in ``/proc/meminfo``.
+To identify what applications are mapping file transparent huge pages, it
+is necessary to read ``/proc/PID/smaps`` and count the FileHugeMapped fields
+for each mapping.
+
+Note that reading the smaps file is expensive and reading it
+frequently will incur overhead.
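+A hedged sketch of the smaps accounting for a process inspecting itself
+(field name as documented above; values are in kB)::
+
+	#include <stdio.h>
+
+	/* Sum all AnonHugePages: fields of /proc/self/smaps. */
+	static long anon_huge_kb(void)
+	{
+		char line[256];
+		long kb, total = 0;
+		FILE *f = fopen("/proc/self/smaps", "r");
+
+		if (!f)
+			return -1;
+		while (fgets(line, sizeof(line), f))
+			if (sscanf(line, "AnonHugePages: %ld kB", &kb) == 1)
+				total += kb;
+		fclose(f);
+		return total;
+	}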
+There are a number of counters in ``/proc/vmstat`` that may be used to
+monitor how successfully the system is providing huge pages for use.
+
+thp_fault_alloc
+	is incremented every time a huge page is successfully
+	allocated to handle a page fault. This applies to both the
+	first time a page is faulted and for COW faults.
+
+thp_collapse_alloc
+	is incremented by khugepaged when it has found
+	a range of pages to collapse into one huge page and has
+	successfully allocated a new huge page to store the data.
+
+thp_fault_fallback
+	is incremented if a page fault fails to allocate
+	a huge page and instead falls back to using small pages.
+
+thp_collapse_alloc_failed
+	is incremented if khugepaged found a range
+	of pages that should be collapsed into one huge page but failed
+	the allocation.
+
+thp_file_alloc
+	is incremented every time a file huge page is successfully
+	allocated.
+
+thp_file_mapped
+	is incremented every time a file huge page is mapped into
+	user address space.
+
+thp_split_page
+	is incremented every time a huge page is split into base
+	pages. This can happen for a variety of reasons but a common
+	reason is that a huge page is old and is being reclaimed.
+	This action implies splitting all PMDs the page was mapped with.
+
+thp_split_page_failed
+	is incremented if the kernel fails to split a huge
+	page. This can happen if the page was pinned by somebody.
+
+thp_deferred_split_page
+	is incremented when a huge page is put onto the split
+	queue. This happens when a huge page is partially unmapped and
+	splitting it would free up some memory. Pages on the split queue
+	are going to be split under memory pressure.
+
+thp_split_pmd
+	is incremented every time a PMD is split into a table of PTEs.
+	This can happen, for instance, when an application calls mprotect()
+	or munmap() on part of a huge page. It doesn't split the huge page,
+	only the page table entry.
+
+thp_zero_page_alloc
+	is incremented every time a huge zero page is
+	successfully allocated. It includes allocations which were
+	dropped due to a race with another allocation. Note, it doesn't
+	count every map of the huge zero page, only its allocation.
+
+thp_zero_page_alloc_failed
+	is incremented if the kernel fails to allocate a
+	huge zero page and falls back to using small pages.
+
+As the system ages, allocating huge pages may be expensive as the
+system uses memory compaction to copy data around memory to free a
+huge page for use. There are some counters in ``/proc/vmstat`` to help
+monitor this overhead.
+
+compact_stall
+    is incremented every time a process stalls to run
+    memory compaction so that a huge page is free for use.
+
+compact_success
+    is incremented if the system compacted memory and
+    freed a huge page for use.
+
+compact_fail
+    is incremented if the system tries to compact memory
+    but fails.
+
+compact_pages_moved
+    is incremented each time a page is moved. If
+    this value is increasing rapidly, it implies that the system
+    is copying a lot of data to satisfy the huge page allocation.
+    It is possible that the cost of copying exceeds any savings
+    from reduced TLB misses.
+
+compact_pagemigrate_failed
+    is incremented when the underlying mechanism
+    for moving a page failed.
+
+compact_blocks_moved
+    is incremented each time memory compaction examines
+    a huge page aligned range of pages.
+
+It is possible to establish how long the stalls were using the function
+tracer to record how long was spent in __alloc_pages_nodemask and
+using the mm_page_alloc tracepoint to identify which allocations were
+for huge pages.
+
+get_user_pages and follow_page
+==============================
+
+get_user_pages and follow_page, if run on a hugepage, will return the
+head or tail pages as usual (exactly as they would do on
+hugetlbfs). Most GUP users will only care about the actual physical
+address of the page and its temporary pinning to release after the I/O
+is complete, so they won't ever notice the fact the page is huge. But
+if any driver is going to poke at the page structure of the tail
+page (like for checking page->mapping or other bits that are relevant
+for the head page and not the tail page), it should be updated to
+check the head page instead. Taking a reference on any head/tail page
+would prevent the page from being split by anyone.
+
+.. note::
+   these aren't new constraints to the GUP API, and they match the
+   same constraints that apply to hugetlbfs too, so any driver capable
+   of handling GUP on hugetlbfs will also work fine on transparent
+   hugepage backed mappings.
+
+If you can't handle compound pages returned by follow_page, the
+FOLL_SPLIT bit can be specified as a parameter to follow_page, so that
+it will split the hugepages before returning them. Migration, for
+example, passes FOLL_SPLIT as a parameter to follow_page because it's
+not hugepage aware and in fact it can't work at all on hugetlbfs (but
+it instead works fine on transparent hugepages thanks to FOLL_SPLIT).
+Migration simply can't deal with hugepages being returned (as it's not
+only checking the pfn of the page and pinning it during the copy, but
+it pretends to migrate the memory in regular page sizes and with
+regular pte/pmd mappings).
+
+Optimizing the applications
+===========================
+
+To be guaranteed that the kernel will map a 2M page immediately in any
+memory region, the mmap region has to be hugepage naturally
+aligned. posix_memalign() can provide that guarantee.
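+
+For instance, an application might reserve an aligned region and mark it
+for hugepage backing as follows (an illustrative userspace sketch; the
+2M constant assumes x86_64 - the authoritative value is reported by
+``hpage_pmd_size``)::
+
+    #include <stdlib.h>
+    #include <string.h>
+    #include <sys/mman.h>
+
+    #define HPAGE_SIZE (2UL * 1024 * 1024)
+
+    int main(void)
+    {
+            void *buf;
+
+            /* naturally aligned, so the kernel can use huge pages */
+            if (posix_memalign(&buf, HPAGE_SIZE, 16 * HPAGE_SIZE))
+                    return 1;
+
+            /* request THP backing even under the "madvise" policy */
+            if (madvise(buf, 16 * HPAGE_SIZE, MADV_HUGEPAGE))
+                    return 1;
+
+            /* first touch faults the region in, huge pages permitting */
+            memset(buf, 0, 16 * HPAGE_SIZE);
+            free(buf);
+            return 0;
+    }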
+
+Hugetlbfs
+=========
+
+You can use hugetlbfs on a kernel that has transparent hugepage
+support enabled just fine as always. No difference can be noted in
+hugetlbfs other than there will be less overall fragmentation. All
+usual features belonging to hugetlbfs are preserved and
+unaffected. libhugetlbfs will also work fine as usual.
+
+Graceful fallback
+=================
+
+Code walking pagetables but unaware of huge pmds can simply call
+split_huge_pmd(vma, pmd, addr) where the pmd is the one returned by
+pmd_offset. It's trivial to make the code transparent hugepage aware
+by just grepping for "pmd_offset" and adding split_huge_pmd where
+missing after pmd_offset returns the pmd. Thanks to the graceful
+fallback design, with a one-liner change, you can avoid writing
+hundreds if not thousands of lines of complex code to make your code
+hugepage aware.
+
+If you're not walking pagetables but run into a physical hugepage
+that you can't handle natively in your code, you can split it by
+calling split_huge_page(page). This is what the Linux VM does before
+it tries to swap out the hugepage, for example. split_huge_page() can
+fail if the page is pinned and you must handle this correctly.
+
+Example to make mremap.c transparent hugepage aware with a one-liner
+change::
+
+    diff --git a/mm/mremap.c b/mm/mremap.c
+    --- a/mm/mremap.c
+    +++ b/mm/mremap.c
+    @@ -41,6 +41,7 @@ static pmd_t *get_old_pmd(struct mm_stru
+                    return NULL;
+
+            pmd = pmd_offset(pud, addr);
+    +       split_huge_pmd(vma, pmd, addr);
+            if (pmd_none_or_clear_bad(pmd))
+                    return NULL;
+
+Locking in hugepage aware code
+==============================
+
+We want as much code as possible hugepage aware, as calling
+split_huge_page() or split_huge_pmd() has a cost.
+
+To make pagetable walks huge pmd aware, all you need to do is to call
+pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the
+mmap_sem in read (or write) mode to be sure a huge pmd cannot be
+created from under you by khugepaged (khugepaged collapse_huge_page
+takes the mmap_sem in write mode in addition to the anon_vma lock). If
+pmd_trans_huge returns false, you just fall back to the old code
+paths. If instead pmd_trans_huge returns true, you have to take the
+page table lock (pmd_lock()) and re-run pmd_trans_huge. Taking the
+page table lock will prevent the huge pmd from being converted into a
+regular pmd from under you (split_huge_pmd can run in parallel to the
+pagetable walk). If the second pmd_trans_huge returns false, you
+should just drop the page table lock and fall back to the old code as
+before. Otherwise you can proceed to process the huge pmd and the
+hugepage natively. Once finished you can drop the page table lock.
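+
+Sketched as code, a walker following these rules might look like this
+(an illustrative sketch only, not a copy of any in-kernel walker;
+``walk_one_pmd`` is a made-up name)::
+
+    /* Caller holds mmap_sem in read (or write) mode. */
+    static void walk_one_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+                             unsigned long addr)
+    {
+            spinlock_t *ptl;
+
+            if (pmd_trans_huge(*pmd)) {
+                    ptl = pmd_lock(vma->vm_mm, pmd);
+                    if (pmd_trans_huge(*pmd)) {
+                            /* still huge: process it natively... */
+                            spin_unlock(ptl);
+                            return;
+                    }
+                    /* split under us: fall through to the pte path */
+                    spin_unlock(ptl);
+            }
+            /* ... regular pte-level processing ... */
+    }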
+
+Refcounts and transparent huge pages
+====================================
+
+Refcounting on THP is mostly consistent with refcounting on other compound
+pages:
+
+  - get_page()/put_page() and GUP operate on the head page's ->_refcount.
+
+  - ->_refcount in tail pages is always zero: get_page_unless_zero() never
+    succeeds on tail pages.
+
+  - map/unmap of pages with a PTE entry increments/decrements ->_mapcount
+    on the relevant sub-page of the compound page.
+
+  - map/unmap of the whole compound page is accounted in compound_mapcount
+    (stored in the first tail page). For file huge pages, we also increment
+    ->_mapcount of all sub-pages in order to have race-free detection of
+    the last unmap of subpages.
+
+PageDoubleMap() indicates that the page is *possibly* mapped with PTEs.
+
+For anonymous pages PageDoubleMap() also indicates ->_mapcount in all
+subpages is offset up by one. This additional reference is required to
+get race-free detection of unmap of subpages when we have them mapped with
+both PMDs and PTEs.
+
+This optimization is required to lower the overhead of per-subpage mapcount
+tracking. The alternative is to alter ->_mapcount in all subpages on each
+map/unmap of the whole compound page.
+
+For anonymous pages, we set PG_double_map when a PMD of the page is split
+for the first time while the page still has a PMD mapping. The additional
+references go away with the last compound_mapcount.
+
+File pages get PG_double_map set on the first map of the page with a PTE;
+the flag goes away when the page is evicted from the page cache.
+
+split_huge_page internally has to distribute the refcounts in the head
+page to the tail pages before clearing all PG_head/tail bits from the page
+structures. It can be done easily for refcounts taken by page table
+entries. But we don't have enough information on how to distribute any
+additional pins (e.g. from get_user_pages). split_huge_page() fails any
+request to split a pinned huge page: it expects the page count to be equal
+to the sum of the mapcounts of all sub-pages plus one (the split_huge_page
+caller must hold a reference to the head page).
+
+split_huge_page uses migration entries to stabilize page->_refcount and
+page->_mapcount of anonymous pages. File pages are just unmapped.
+
+We are safe against physical memory scanners too: the only legitimate way
+a scanner can get a reference to a page is get_page_unless_zero().
+
+All tail pages have zero ->_refcount until atomic_add(). This prevents the
+scanner from getting a reference to the tail page up to that point. After
+the atomic_add() we don't care about the ->_refcount value. We already
+know how many references should be uncharged from the head page.
+
+For the head page get_page_unless_zero() will succeed and we don't mind.
+It's clear where the reference should go after the split: it will stay on
+the head page.
+
+Note that split_huge_pmd() doesn't have any limitation on refcounting:
+the pmd can be split at any point and the operation never fails.
+
+Partial unmap and deferred_split_huge_page()
+============================================
+
+Unmapping part of a THP (with munmap() or another way) is not going to
+free memory immediately. Instead, we detect that a subpage of the THP is
+not in use in page_remove_rmap() and queue the THP for splitting if memory
+pressure comes. Splitting will free up unused subpages.
+
+Splitting the page right away is not an option due to locking context in
+the place where we can detect partial unmap. It also might be
+counterproductive, since in many cases partial unmap happens during
+exit(2) if a THP crosses a VMA boundary.
+
+The function deferred_split_huge_page() is used to queue a page for
+splitting. The splitting itself will happen when we get memory pressure
+via the shrinker interface.
diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
deleted file mode 100644
index 569d182cc973..000000000000
--- a/Documentation/vm/transhuge.txt
+++ /dev/null
@@ -1,573 +0,0 @@
-.. _transhuge:
-
-============================
-Transparent Hugepage Support
-============================
-
-Objective
-=========
-
-Performance critical computing applications dealing with large memory
-working sets are already running on top of libhugetlbfs and in turn
-hugetlbfs. Transparent Hugepage Support is an alternative means of
-using huge pages for the backing of virtual memory with huge pages
-that supports the automatic promotion and demotion of page sizes and
-without the shortcomings of hugetlbfs.
-
-Currently it only works for anonymous memory mappings and tmpfs/shmem.
-But in the future it can expand to other filesystems.
-
-The reason applications are running faster is because of two
-factors. The first factor is almost completely irrelevant and it's not
-of significant interest because it'll also have the downside of
-requiring larger clear-page copy-page in page faults which is a
-potentially negative effect.
The first factor consists in taking a -single page fault for each 2M virtual region touched by userland (so -reducing the enter/exit kernel frequency by a 512 times factor). This -only matters the first time the memory is accessed for the lifetime of -a memory mapping. The second long lasting and much more important -factor will affect all subsequent accesses to the memory for the whole -runtime of the application. The second factor consist of two -components: 1) the TLB miss will run faster (especially with -virtualization using nested pagetables but almost always also on bare -metal without virtualization) and 2) a single TLB entry will be -mapping a much larger amount of virtual memory in turn reducing the -number of TLB misses. With virtualization and nested pagetables the -TLB can be mapped of larger size only if both KVM and the Linux guest -are using hugepages but a significant speedup already happens if only -one of the two is using hugepages just because of the fact the TLB -miss is going to run faster. - -Design -====== - -- "graceful fallback": mm components which don't have transparent hugepage - knowledge fall back to breaking huge pmd mapping into table of ptes and, - if necessary, split a transparent hugepage. Therefore these components - can continue working on the regular pages or regular pte mappings. - -- if a hugepage allocation fails because of memory fragmentation, - regular pages should be gracefully allocated instead and mixed in - the same vma without any failure or significant delay and without - userland noticing - -- if some task quits and more hugepages become available (either - immediately in the buddy or through the VM), guest physical memory - backed by regular pages should be relocated on hugepages - automatically (with khugepaged) - -- it doesn't require memory reservation and in turn it uses hugepages - whenever possible (the only possible reservation here is kernelcore= - to avoid unmovable pages to fragment all the memory but such a tweak - is not specific to transparent hugepage support and it's a generic - feature that applies to all dynamic high order allocations in the - kernel) - -Transparent Hugepage Support maximizes the usefulness of free memory -if compared to the reservation approach of hugetlbfs by allowing all -unused memory to be used as cache or other movable (or even unmovable -entities). It doesn't require reservation to prevent hugepage -allocation failures to be noticeable from userland. It allows paging -and all other advanced VM features to be available on the -hugepages. It requires no modifications for applications to take -advantage of it. - -Applications however can be further optimized to take advantage of -this feature, like for example they've been optimized before to avoid -a flood of mmap system calls for every malloc(4k). Optimizing userland -is by far not mandatory and khugepaged already can take care of long -lived page allocations even for hugepage unaware applications that -deals with large amounts of memory. - -In certain cases when hugepages are enabled system wide, application -may end up allocating more memory resources. An application may mmap a -large region but only touch 1 byte of it, in that case a 2M page might -be allocated instead of a 4k page for no good. This is why it's -possible to disable hugepages system-wide and to only have them inside -MADV_HUGEPAGE madvise regions. 
- -Embedded systems should enable hugepages only inside madvise regions -to eliminate any risk of wasting any precious byte of memory and to -only run faster. - -Applications that gets a lot of benefit from hugepages and that don't -risk to lose memory by using hugepages, should use -madvise(MADV_HUGEPAGE) on their critical mmapped regions. - -sysfs -===== - -Transparent Hugepage Support for anonymous memory can be entirely disabled -(mostly for debugging purposes) or only enabled inside MADV_HUGEPAGE -regions (to avoid the risk of consuming more memory resources) or enabled -system wide. This can be achieved with one of:: - - echo always >/sys/kernel/mm/transparent_hugepage/enabled - echo madvise >/sys/kernel/mm/transparent_hugepage/enabled - echo never >/sys/kernel/mm/transparent_hugepage/enabled - -It's also possible to limit defrag efforts in the VM to generate -anonymous hugepages in case they're not immediately free to madvise -regions or to never try to defrag memory and simply fallback to regular -pages unless hugepages are immediately available. Clearly if we spend CPU -time to defrag memory, we would expect to gain even more by the fact we -use hugepages later instead of regular pages. This isn't always -guaranteed, but it may be more likely in case the allocation is for a -MADV_HUGEPAGE region. - -:: - - echo always >/sys/kernel/mm/transparent_hugepage/defrag - echo defer >/sys/kernel/mm/transparent_hugepage/defrag - echo defer+madvise >/sys/kernel/mm/transparent_hugepage/defrag - echo madvise >/sys/kernel/mm/transparent_hugepage/defrag - echo never >/sys/kernel/mm/transparent_hugepage/defrag - -always - means that an application requesting THP will stall on - allocation failure and directly reclaim pages and compact - memory in an effort to allocate a THP immediately. This may be - desirable for virtual machines that benefit heavily from THP - use and are willing to delay the VM start to utilise them. - -defer - means that an application will wake kswapd in the background - to reclaim pages and wake kcompactd to compact memory so that - THP is available in the near future. It's the responsibility - of khugepaged to then install the THP pages later. - -defer+madvise - will enter direct reclaim and compaction like ``always``, but - only for regions that have used madvise(MADV_HUGEPAGE); all - other regions will wake kswapd in the background to reclaim - pages and wake kcompactd to compact memory so that THP is - available in the near future. - -madvise - will enter direct reclaim like ``always`` but only for regions - that are have used madvise(MADV_HUGEPAGE). This is the default - behaviour. - -never - should be self-explanatory. - -By default kernel tries to use huge zero page on read page fault to -anonymous mapping. It's possible to disable huge zero page by writing 0 -or enable it back by writing 1:: - - echo 0 >/sys/kernel/mm/transparent_hugepage/use_zero_page - echo 1 >/sys/kernel/mm/transparent_hugepage/use_zero_page - -Some userspace (such as a test program, or an optimized memory allocation -library) may want to know the size (in bytes) of a transparent hugepage:: - - cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size - -khugepaged will be automatically started when -transparent_hugepage/enabled is set to "always" or "madvise, and it'll -be automatically shutdown if it's set to "never". 
- -khugepaged runs usually at low frequency so while one may not want to -invoke defrag algorithms synchronously during the page faults, it -should be worth invoking defrag at least in khugepaged. However it's -also possible to disable defrag in khugepaged by writing 0 or enable -defrag in khugepaged by writing 1:: - - echo 0 >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag - echo 1 >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag - -You can also control how many pages khugepaged should scan at each -pass:: - - /sys/kernel/mm/transparent_hugepage/khugepaged/pages_to_scan - -and how many milliseconds to wait in khugepaged between each pass (you -can set this to 0 to run khugepaged at 100% utilization of one core):: - - /sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs - -and how many milliseconds to wait in khugepaged if there's an hugepage -allocation failure to throttle the next allocation attempt:: - - /sys/kernel/mm/transparent_hugepage/khugepaged/alloc_sleep_millisecs - -The khugepaged progress can be seen in the number of pages collapsed:: - - /sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed - -for each pass:: - - /sys/kernel/mm/transparent_hugepage/khugepaged/full_scans - -``max_ptes_none`` specifies how many extra small pages (that are -not already mapped) can be allocated when collapsing a group -of small pages into one large page:: - - /sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none - -A higher value leads to use additional memory for programs. -A lower value leads to gain less thp performance. Value of -max_ptes_none can waste cpu time very little, you can -ignore it. - -``max_ptes_swap`` specifies how many pages can be brought in from -swap when collapsing a group of pages into a transparent huge page:: - - /sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_swap - -A higher value can cause excessive swap IO and waste -memory. A lower value can prevent THPs from being -collapsed, resulting fewer pages being collapsed into -THPs, and lower memory access performance. - -Boot parameter -============== - -You can change the sysfs boot time defaults of Transparent Hugepage -Support by passing the parameter ``transparent_hugepage=always`` or -``transparent_hugepage=madvise`` or ``transparent_hugepage=never`` -to the kernel command line. - -Hugepages in tmpfs/shmem -======================== - -You can control hugepage allocation policy in tmpfs with mount option -``huge=``. It can have following values: - -always - Attempt to allocate huge pages every time we need a new page; - -never - Do not allocate huge pages; - -within_size - Only allocate huge page if it will be fully within i_size. - Also respect fadvise()/madvise() hints; - -advise - Only allocate huge pages if requested with fadvise()/madvise(); - -The default policy is ``never``. - -``mount -o remount,huge= /mountpoint`` works fine after mount: remounting -``huge=never`` will not attempt to break up huge pages at all, just stop more -from being allocated. - -There's also sysfs knob to control hugepage allocation policy for internal -shmem mount: /sys/kernel/mm/transparent_hugepage/shmem_enabled. The mount -is used for SysV SHM, memfds, shared anonymous mmaps (of /dev/zero or -MAP_ANONYMOUS), GPU drivers' DRM objects, Ashmem. 
- -In addition to policies listed above, shmem_enabled allows two further -values: - -deny - For use in emergencies, to force the huge option off from - all mounts; -force - Force the huge option on for all - very useful for testing; - -Need of application restart -=========================== - -The transparent_hugepage/enabled values and tmpfs mount option only affect -future behavior. So to make them effective you need to restart any -application that could have been using hugepages. This also applies to the -regions registered in khugepaged. - -Monitoring usage -================ - -The number of anonymous transparent huge pages currently used by the -system is available by reading the AnonHugePages field in ``/proc/meminfo``. -To identify what applications are using anonymous transparent huge pages, -it is necessary to read ``/proc/PID/smaps`` and count the AnonHugePages fields -for each mapping. - -The number of file transparent huge pages mapped to userspace is available -by reading ShmemPmdMapped and ShmemHugePages fields in ``/proc/meminfo``. -To identify what applications are mapping file transparent huge pages, it -is necessary to read ``/proc/PID/smaps`` and count the FileHugeMapped fields -for each mapping. - -Note that reading the smaps file is expensive and reading it -frequently will incur overhead. - -There are a number of counters in ``/proc/vmstat`` that may be used to -monitor how successfully the system is providing huge pages for use. - -thp_fault_alloc - is incremented every time a huge page is successfully - allocated to handle a page fault. This applies to both the - first time a page is faulted and for COW faults. - -thp_collapse_alloc - is incremented by khugepaged when it has found - a range of pages to collapse into one huge page and has - successfully allocated a new huge page to store the data. - -thp_fault_fallback - is incremented if a page fault fails to allocate - a huge page and instead falls back to using small pages. - -thp_collapse_alloc_failed - is incremented if khugepaged found a range - of pages that should be collapsed into one huge page but failed - the allocation. - -thp_file_alloc - is incremented every time a file huge page is successfully - allocated. - -thp_file_mapped - is incremented every time a file huge page is mapped into - user address space. - -thp_split_page - is incremented every time a huge page is split into base - pages. This can happen for a variety of reasons but a common - reason is that a huge page is old and is being reclaimed. - This action implies splitting all PMD the page mapped with. - -thp_split_page_failed - is incremented if kernel fails to split huge - page. This can happen if the page was pinned by somebody. - -thp_deferred_split_page - is incremented when a huge page is put onto split - queue. This happens when a huge page is partially unmapped and - splitting it would free up some memory. Pages on split queue are - going to be split under memory pressure. - -thp_split_pmd - is incremented every time a PMD split into table of PTEs. - This can happen, for instance, when application calls mprotect() or - munmap() on part of huge page. It doesn't split huge page, only - page table entry. - -thp_zero_page_alloc - is incremented every time a huge zero page is - successfully allocated. It includes allocations which where - dropped due race with other allocation. Note, it doesn't count - every map of the huge zero page, only its allocation. 
- -thp_zero_page_alloc_failed - is incremented if kernel fails to allocate - huge zero page and falls back to using small pages. - -As the system ages, allocating huge pages may be expensive as the -system uses memory compaction to copy data around memory to free a -huge page for use. There are some counters in ``/proc/vmstat`` to help -monitor this overhead. - -compact_stall - is incremented every time a process stalls to run - memory compaction so that a huge page is free for use. - -compact_success - is incremented if the system compacted memory and - freed a huge page for use. - -compact_fail - is incremented if the system tries to compact memory - but failed. - -compact_pages_moved - is incremented each time a page is moved. If - this value is increasing rapidly, it implies that the system - is copying a lot of data to satisfy the huge page allocation. - It is possible that the cost of copying exceeds any savings - from reduced TLB misses. - -compact_pagemigrate_failed - is incremented when the underlying mechanism - for moving a page failed. - -compact_blocks_moved - is incremented each time memory compaction examines - a huge page aligned range of pages. - -It is possible to establish how long the stalls were using the function -tracer to record how long was spent in __alloc_pages_nodemask and -using the mm_page_alloc tracepoint to identify which allocations were -for huge pages. - -get_user_pages and follow_page -============================== - -get_user_pages and follow_page if run on a hugepage, will return the -head or tail pages as usual (exactly as they would do on -hugetlbfs). Most gup users will only care about the actual physical -address of the page and its temporary pinning to release after the I/O -is complete, so they won't ever notice the fact the page is huge. But -if any driver is going to mangle over the page structure of the tail -page (like for checking page->mapping or other bits that are relevant -for the head page and not the tail page), it should be updated to jump -to check head page instead. Taking reference on any head/tail page would -prevent page from being split by anyone. - -.. note:: - these aren't new constraints to the GUP API, and they match the - same constrains that applies to hugetlbfs too, so any driver capable - of handling GUP on hugetlbfs will also work fine on transparent - hugepage backed mappings. - -In case you can't handle compound pages if they're returned by -follow_page, the FOLL_SPLIT bit can be specified as parameter to -follow_page, so that it will split the hugepages before returning -them. Migration for example passes FOLL_SPLIT as parameter to -follow_page because it's not hugepage aware and in fact it can't work -at all on hugetlbfs (but it instead works fine on transparent -hugepages thanks to FOLL_SPLIT). migration simply can't deal with -hugepages being returned (as it's not only checking the pfn of the -page and pinning it during the copy but it pretends to migrate the -memory in regular page sizes and with regular pte/pmd mappings). - -Optimizing the applications -=========================== - -To be guaranteed that the kernel will map a 2M page immediately in any -memory region, the mmap region has to be hugepage naturally -aligned. posix_memalign() can provide that guarantee. - -Hugetlbfs -========= - -You can use hugetlbfs on a kernel that has transparent hugepage -support enabled just fine as always. No difference can be noted in -hugetlbfs other than there will be less overall fragmentation. 
All -usual features belonging to hugetlbfs are preserved and -unaffected. libhugetlbfs will also work fine as usual. - -Graceful fallback -================= - -Code walking pagetables but unaware about huge pmds can simply call -split_huge_pmd(vma, pmd, addr) where the pmd is the one returned by -pmd_offset. It's trivial to make the code transparent hugepage aware -by just grepping for "pmd_offset" and adding split_huge_pmd where -missing after pmd_offset returns the pmd. Thanks to the graceful -fallback design, with a one liner change, you can avoid to write -hundred if not thousand of lines of complex code to make your code -hugepage aware. - -If you're not walking pagetables but you run into a physical hugepage -but you can't handle it natively in your code, you can split it by -calling split_huge_page(page). This is what the Linux VM does before -it tries to swapout the hugepage for example. split_huge_page() can fail -if the page is pinned and you must handle this correctly. - -Example to make mremap.c transparent hugepage aware with a one liner -change:: - - diff --git a/mm/mremap.c b/mm/mremap.c - --- a/mm/mremap.c - +++ b/mm/mremap.c - @@ -41,6 +41,7 @@ static pmd_t *get_old_pmd(struct mm_stru - return NULL; - - pmd = pmd_offset(pud, addr); - + split_huge_pmd(vma, pmd, addr); - if (pmd_none_or_clear_bad(pmd)) - return NULL; - -Locking in hugepage aware code -============================== - -We want as much code as possible hugepage aware, as calling -split_huge_page() or split_huge_pmd() has a cost. - -To make pagetable walks huge pmd aware, all you need to do is to call -pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the -mmap_sem in read (or write) mode to be sure an huge pmd cannot be -created from under you by khugepaged (khugepaged collapse_huge_page -takes the mmap_sem in write mode in addition to the anon_vma lock). If -pmd_trans_huge returns false, you just fallback in the old code -paths. If instead pmd_trans_huge returns true, you have to take the -page table lock (pmd_lock()) and re-run pmd_trans_huge. Taking the -page table lock will prevent the huge pmd to be converted into a -regular pmd from under you (split_huge_pmd can run in parallel to the -pagetable walk). If the second pmd_trans_huge returns false, you -should just drop the page table lock and fallback to the old code as -before. Otherwise you can proceed to process the huge pmd and the -hugepage natively. Once finished you can drop the page table lock. - -Refcounts and transparent huge pages -==================================== - -Refcounting on THP is mostly consistent with refcounting on other compound -pages: - - - get_page()/put_page() and GUP operate in head page's ->_refcount. - - - ->_refcount in tail pages is always zero: get_page_unless_zero() never - succeed on tail pages. - - - map/unmap of the pages with PTE entry increment/decrement ->_mapcount - on relevant sub-page of the compound page. - - - map/unmap of the whole compound page accounted in compound_mapcount - (stored in first tail page). For file huge pages, we also increment - ->_mapcount of all sub-pages in order to have race-free detection of - last unmap of subpages. - -PageDoubleMap() indicates that the page is *possibly* mapped with PTEs. - -For anonymous pages PageDoubleMap() also indicates ->_mapcount in all -subpages is offset up by one. This additional reference is required to -get race-free detection of unmap of subpages when we have them mapped with -both PMDs and PTEs. 
- -This is optimization required to lower overhead of per-subpage mapcount -tracking. The alternative is alter ->_mapcount in all subpages on each -map/unmap of the whole compound page. - -For anonymous pages, we set PG_double_map when a PMD of the page got split -for the first time, but still have PMD mapping. The additional references -go away with last compound_mapcount. - -File pages get PG_double_map set on first map of the page with PTE and -goes away when the page gets evicted from page cache. - -split_huge_page internally has to distribute the refcounts in the head -page to the tail pages before clearing all PG_head/tail bits from the page -structures. It can be done easily for refcounts taken by page table -entries. But we don't have enough information on how to distribute any -additional pins (i.e. from get_user_pages). split_huge_page() fails any -requests to split pinned huge page: it expects page count to be equal to -sum of mapcount of all sub-pages plus one (split_huge_page caller must -have reference for head page). - -split_huge_page uses migration entries to stabilize page->_refcount and -page->_mapcount of anonymous pages. File pages just got unmapped. - -We safe against physical memory scanners too: the only legitimate way -scanner can get reference to a page is get_page_unless_zero(). - -All tail pages have zero ->_refcount until atomic_add(). This prevents the -scanner from getting a reference to the tail page up to that point. After the -atomic_add() we don't care about the ->_refcount value. We already known how -many references should be uncharged from the head page. - -For head page get_page_unless_zero() will succeed and we don't mind. It's -clear where reference should go after split: it will stay on head page. - -Note that split_huge_pmd() doesn't have any limitation on refcounting: -pmd can be split at any point and never fails. - -Partial unmap and deferred_split_huge_page() -============================================ - -Unmapping part of THP (with munmap() or other way) is not going to free -memory immediately. Instead, we detect that a subpage of THP is not in use -in page_remove_rmap() and queue the THP for splitting if memory pressure -comes. Splitting will free up unused subpages. - -Splitting the page right away is not an option due to locking context in -the place where we can detect partial unmap. It's also might be -counterproductive since in many cases partial unmap happens during exit(2) if -a THP crosses a VMA boundary. - -Function deferred_split_huge_page() is used to queue page for splitting. -The splitting itself will happen when we get memory pressure via shrinker -interface. diff --git a/Documentation/vm/unevictable-lru.rst b/Documentation/vm/unevictable-lru.rst new file mode 100644 index 000000000000..fdd84cb8d511 --- /dev/null +++ b/Documentation/vm/unevictable-lru.rst @@ -0,0 +1,614 @@ +.. _unevictable_lru: + +============================== +Unevictable LRU Infrastructure +============================== + +.. contents:: :local: + + +Introduction +============ + +This document describes the Linux memory manager's "Unevictable LRU" +infrastructure and the use of this to manage several types of "unevictable" +pages. + +The document attempts to provide the overall rationale behind this mechanism +and the rationale for some of the design decisions that drove the +implementation. The latter design rationale is discussed in the context of an +implementation description. 
Admittedly, one can obtain the implementation
+details - the "what does it do?" - by reading the code. One hopes that the
+descriptions below add value by providing the answer to "why does it do
+that?".
+
+
+
+The Unevictable LRU
+===================
+
+The Unevictable LRU facility adds an additional LRU list to track unevictable
+pages and to hide these pages from vmscan. This mechanism is based on a patch
+by Larry Woodman of Red Hat to address several scalability problems with page
+reclaim in Linux. The problems have been observed at customer sites on large
+memory x86_64 systems.
+
+To illustrate this with an example, a non-NUMA x86_64 platform with 128GB of
+main memory will have over 32 million 4k pages in a single zone. When a large
+fraction of these pages are not evictable for any reason [see below], vmscan
+will spend a lot of time scanning the LRU lists looking for the small fraction
+of pages that are evictable. This can result in a situation where all CPUs are
+spending 100% of their time in vmscan for hours or days on end, with the system
+completely unresponsive.
+
+The unevictable list addresses the following classes of unevictable pages:
+
+ * Those owned by ramfs.
+
+ * Those mapped into SHM_LOCK'd shared memory regions.
+
+ * Those mapped into VM_LOCKED [mlock()ed] VMAs.
+
+The infrastructure may also be able to handle other conditions that make pages
+unevictable, either by definition or by circumstance, in the future.
+
+
+The Unevictable Page List
+-------------------------
+
+The Unevictable LRU infrastructure consists of an additional, per-zone, LRU
+list called the "unevictable" list and an associated page flag,
+PG_unevictable, to indicate that the page is being managed on the unevictable
+list.
+
+The PG_unevictable flag is analogous to, and mutually exclusive with, the
+PG_active flag in that it indicates on which LRU list a page resides when
+PG_lru is set.
+
+The Unevictable LRU infrastructure maintains unevictable pages on an additional
+LRU list for a few reasons:
+
+ (1) We get to "treat unevictable pages just like we treat other pages in the
+     system - which means we get to use the same code to manipulate them, the
+     same code to isolate them (for migrate, etc.), the same code to keep track
+     of the statistics, etc..." [Rik van Riel]
+
+ (2) We want to be able to migrate unevictable pages between nodes for memory
+     defragmentation, workload management and memory hotplug. The Linux kernel
+     can only migrate pages that it can successfully isolate from the LRU
+     lists. If we were to maintain pages elsewhere than on an LRU-like list,
+     where they can be found by isolate_lru_page(), we would prevent their
+     migration, unless we reworked migration code to find the unevictable pages
+     itself.
+
+
+The unevictable list does not differentiate between file-backed and anonymous,
+swap-backed pages. This differentiation is only important while the pages are,
+in fact, evictable.
+
+The unevictable list benefits from the "arrayification" of the per-zone LRU
+lists and statistics originally proposed and posted by Christoph Lameter.
+
+The unevictable list does not use the LRU pagevec mechanism. Rather,
+unevictable pages are placed directly on the page's zone's unevictable list
+under the zone lru_lock. This allows us to prevent the stranding of pages on
+the unevictable list when one task has the page isolated from the LRU and other
+tasks are changing the "evictability" state of the page.
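+
+In code terms, the direct placement is roughly the following (an
+illustrative fragment, not a copy of the kernel's putback path; recent
+kernels keep the lru_lock in the node's pglist_data, so the node-era
+helpers are used here)::
+
+    struct pglist_data *pgdat = page_pgdat(page);
+    struct lruvec *lruvec;
+
+    spin_lock_irq(&pgdat->lru_lock);
+    lruvec = mem_cgroup_page_lruvec(page, pgdat);
+    if (!page_evictable(page)) {
+            SetPageUnevictable(page);
+            add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE);
+    } else {
+            /* the page became evictable while we weren't looking */
+            add_page_to_lru_list(page, lruvec, page_lru(page));
+    }
+    spin_unlock_irq(&pgdat->lru_lock);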
+
+
+Memory Control Group Interaction
+--------------------------------
+
+The unevictable LRU facility interacts with the memory control group [aka
+memory controller; see Documentation/cgroup-v1/memory.txt] by extending the
+lru_list enum.
+
+The memory controller data structure automatically gets a per-zone unevictable
+list as a result of the "arrayification" of the per-zone LRU lists (one per
+lru_list enum element). The memory controller tracks the movement of pages to
+and from the unevictable list.
+
+When a memory control group comes under memory pressure, the controller will
+not attempt to reclaim pages on the unevictable list. This has a couple of
+effects:
+
+ (1) Because the pages are "hidden" from reclaim on the unevictable list, the
+     reclaim process can be more efficient, dealing only with pages that have a
+     chance of being reclaimed.
+
+ (2) On the other hand, if too many of the pages charged to the control group
+     are unevictable, the evictable portion of the working set of the tasks in
+     the control group may not fit into the available memory. This can cause
+     the control group to thrash or to OOM-kill tasks.
+
+
+.. _mark_addr_space_unevict:
+
+Marking Address Spaces Unevictable
+----------------------------------
+
+For facilities such as ramfs none of the pages attached to the address space
+may be evicted. To prevent eviction of any such pages, the AS_UNEVICTABLE
+address space flag is provided, and this can be manipulated by a filesystem
+using a number of wrapper functions:
+
+ * ``void mapping_set_unevictable(struct address_space *mapping);``
+
+	Mark the address space as being completely unevictable.
+
+ * ``void mapping_clear_unevictable(struct address_space *mapping);``
+
+	Mark the address space as being evictable.
+
+ * ``int mapping_unevictable(struct address_space *mapping);``
+
+	Query the address space, and return true if it is completely
+	unevictable.
+
+These are currently used in two places in the kernel:
+
+ (1) By ramfs to mark the address spaces of its inodes when they are created,
+     and this mark remains for the life of the inode.
+
+ (2) By SYSV SHM to mark SHM_LOCK'd address spaces until SHM_UNLOCK is called.
+
+     Note that SHM_LOCK is not required to page in the locked pages if they're
+     swapped out; the application must touch the pages manually if it wants to
+     ensure they're in memory.
+
+
+Detecting Unevictable Pages
+---------------------------
+
+The function page_evictable() in vmscan.c determines whether a page is
+evictable or not using the query function outlined above [see section
+:ref:`Marking address spaces unevictable <mark_addr_space_unevict>`]
+to check the AS_UNEVICTABLE flag.
+
+For address spaces that are so marked after being populated (as SHM regions
+might be), the lock action (e.g. SHM_LOCK) can be lazy, and need not populate
+the page tables for the region as does, for example, mlock(), nor need it make
+any special effort to push any pages in the SHM_LOCK'd area to the unevictable
+list. Instead, vmscan will do this if and when it encounters the pages during
+a reclamation scan.
+
+On an unlock action (such as SHM_UNLOCK), the unlocker (e.g. shmctl()) must
+scan the pages in the region and "rescue" them from the unevictable list if
+no other condition is keeping them unevictable. If an unevictable region is
+destroyed, the pages are also "rescued" from the unevictable list in the
+process of freeing them.
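+
+Putting the pieces together, the test is conceptually just the following
+(an illustrative sketch; ``my_page_evictable`` is a made-up name, and the
+real page_evictable() in mm/vmscan.c is the authoritative version)::
+
+    static bool my_page_evictable(struct page *page)
+    {
+            struct address_space *mapping = page_mapping(page);
+
+            /* ramfs inodes and SHM_LOCK'd segments mark the mapping */
+            if (mapping && mapping_unevictable(mapping))
+                    return false;
+
+            /* mlocked pages, covered by the next paragraph */
+            if (PageMlocked(page))
+                    return false;
+
+            return true;
+    }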
+
+page_evictable() also checks for mlocked pages by testing an additional page
+flag, PG_mlocked (as wrapped by PageMlocked()), which is set when a page is
+faulted into a VM_LOCKED VMA, or found in a VMA being VM_LOCKED.
+
+
+Vmscan's Handling of Unevictable Pages
+--------------------------------------
+
+If unevictable pages are culled in the fault path, or moved to the unevictable
+list at mlock() or mmap() time, vmscan will not encounter the pages until they
+have become evictable again (via munlock() for example) and have been "rescued"
+from the unevictable list. However, there may be situations where we decide,
+for the sake of expediency, to leave an unevictable page on one of the regular
+active/inactive LRU lists for vmscan to deal with. vmscan checks for such
+pages in all of the shrink_{active|inactive|page}_list() functions and will
+"cull" such pages that it encounters: that is, it diverts those pages to the
+unevictable list for the zone being scanned.
+
+There may be situations where a page is mapped into a VM_LOCKED VMA, but the
+page is not marked as PG_mlocked. Such pages will make it all the way to
+shrink_page_list() where they will be detected when vmscan walks the reverse
+map in try_to_unmap(). If try_to_unmap() returns SWAP_MLOCK,
+shrink_page_list() will cull the page at that point.
+
+To "cull" an unevictable page, vmscan simply puts the page back on the LRU
+list using putback_lru_page() - the inverse operation to isolate_lru_page() -
+after dropping the page lock. Because the condition which makes the page
+unevictable may change once the page is unlocked, putback_lru_page() will
+recheck the unevictable state of a page that it places on the unevictable
+list. If the page has become evictable again, putback_lru_page() removes it
+from the unevictable list and retries, including the page_evictable() test.
+Because such a race is a rare event and movement of pages onto the
+unevictable list should be rare, these extra evictability checks should not
+occur in the majority of calls to putback_lru_page().
+
+
+MLOCKED Pages
+=============
+
+The unevictable page list is also useful for mlock(), in addition to ramfs
+and SYSV SHM. Note that mlock() is only available in CONFIG_MMU=y situations;
+in NOMMU situations, all mappings are effectively mlocked.
+
+
+History
+-------
+
+The "Unevictable mlocked Pages" infrastructure is based on work originally
+posted by Nick Piggin in an RFC patch entitled "mm: mlocked pages off LRU".
+Nick posted his patch as an alternative to a patch posted by Christoph Lameter
+to achieve the same objective: hiding mlocked pages from vmscan.
+
+In Nick's patch, he used one of the struct page LRU list link fields as a
+count of VM_LOCKED VMAs that map the page. This use of the link field for a
+count prevented the management of the pages on an LRU list, and thus mlocked
+pages were not migratable as isolate_lru_page() could not find them, and the
+LRU list link field was not available to the migration subsystem.
+
+Nick resolved this by putting mlocked pages back on the lru list before
+attempting to isolate them, thus abandoning the count of VM_LOCKED VMAs. When
+Nick's patch was integrated with the Unevictable LRU work, the count was
+replaced by walking the reverse map to determine whether any VM_LOCKED VMAs
+mapped the page. More on this below.
+
+
+Basic Management
+----------------
+
+mlocked pages - pages mapped into a VM_LOCKED VMA - are a class of
+unevictable pages.
When such a page has been "noticed" by the memory management subsystem, +the page is marked with the PG_mlocked flag. This can be manipulated using the +PageMlocked() functions. + +A PG_mlocked page will be placed on the unevictable list when it is added to +the LRU. Such pages can be "noticed" by memory management in several places: + + (1) in the mlock()/mlockall() system call handlers; + + (2) in the mmap() system call handler when mmapping a region with the + MAP_LOCKED flag; + + (3) mmapping a region in a task that has called mlockall() with the MCL_FUTURE + flag + + (4) in the fault path, if mlocked pages are "culled" in the fault path, + and when a VM_LOCKED stack segment is expanded; or + + (5) as mentioned above, in vmscan:shrink_page_list() when attempting to + reclaim a page in a VM_LOCKED VMA via try_to_unmap() + +all of which result in the VM_LOCKED flag being set for the VMA if it doesn't +already have it set. + +mlocked pages become unlocked and rescued from the unevictable list when: + + (1) mapped in a range unlocked via the munlock()/munlockall() system calls; + + (2) munmap()'d out of the last VM_LOCKED VMA that maps the page, including + unmapping at task exit; + + (3) when the page is truncated from the last VM_LOCKED VMA of an mmapped file; + or + + (4) before a page is COW'd in a VM_LOCKED VMA. + + +mlock()/mlockall() System Call Handling +--------------------------------------- + +Both [do\_]mlock() and [do\_]mlockall() system call handlers call mlock_fixup() +for each VMA in the range specified by the call. In the case of mlockall(), +this is the entire active address space of the task. Note that mlock_fixup() +is used for both mlocking and munlocking a range of memory. A call to mlock() +an already VM_LOCKED VMA, or to munlock() a VMA that is not VM_LOCKED is +treated as a no-op, and mlock_fixup() simply returns. + +If the VMA passes some filtering as described in "Filtering Special Vmas" +below, mlock_fixup() will attempt to merge the VMA with its neighbors or split +off a subset of the VMA if the range does not cover the entire VMA. Once the +VMA has been merged or split or neither, mlock_fixup() will call +populate_vma_page_range() to fault in the pages via get_user_pages() and to +mark the pages as mlocked via mlock_vma_page(). + +Note that the VMA being mlocked might be mapped with PROT_NONE. In this case, +get_user_pages() will be unable to fault in the pages. That's okay. If pages +do end up getting faulted into this VM_LOCKED VMA, we'll handle them in the +fault path or in vmscan. + +Also note that a page returned by get_user_pages() could be truncated or +migrated out from under us, while we're trying to mlock it. To detect this, +populate_vma_page_range() checks page_mapping() after acquiring the page lock. +If the page is still associated with its mapping, we'll go ahead and call +mlock_vma_page(). If the mapping is gone, we just unlock the page and move on. +In the worst case, this will result in a page mapped in a VM_LOCKED VMA +remaining on a normal LRU list without being PageMlocked(). Again, vmscan will +detect and cull such pages. + +mlock_vma_page() will call TestSetPageMlocked() for each page returned by +get_user_pages(). We use TestSetPageMlocked() because the page might already +be mlocked by another task/VMA and we don't want to do extra work. We +especially do not want to count an mlocked page more than once in the +statistics. If the page was already mlocked, mlock_vma_page() need do nothing +more. 
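+
+The heart of this is roughly the following pattern (an illustrative
+sketch close to the logic described here; ``my_mlock_vma_page`` is a
+made-up name)::
+
+    static void my_mlock_vma_page(struct page *page)
+    {
+            /* returns the old value: nonzero if already mlocked */
+            if (TestSetPageMlocked(page))
+                    return;
+
+            mod_zone_page_state(page_zone(page), NR_MLOCK,
+                                hpage_nr_pages(page));
+            count_vm_event(UNEVICTABLE_PGMLOCKED);
+
+            /* try to divert the page to the unevictable list now */
+            if (!isolate_lru_page(page))
+                    putback_lru_page(page); /* sees PG_mlocked */
+    }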
+ +If the page was NOT already mlocked, mlock_vma_page() attempts to isolate the +page from the LRU, as it is likely on the appropriate active or inactive list +at that time. If the isolate_lru_page() succeeds, mlock_vma_page() will put +back the page - by calling putback_lru_page() - which will notice that the page +is now mlocked and divert the page to the zone's unevictable list. If +mlock_vma_page() is unable to isolate the page from the LRU, vmscan will handle +it later if and when it attempts to reclaim the page. + + +Filtering Special VMAs +---------------------- + +mlock_fixup() filters several classes of "special" VMAs: + +1) VMAs with VM_IO or VM_PFNMAP set are skipped entirely. The pages behind + these mappings are inherently pinned, so we don't need to mark them as + mlocked. In any case, most of the pages have no struct page in which to so + mark the page. Because of this, get_user_pages() will fail for these VMAs, + so there is no sense in attempting to visit them. + +2) VMAs mapping hugetlbfs page are already effectively pinned into memory. We + neither need nor want to mlock() these pages. However, to preserve the + prior behavior of mlock() - before the unevictable/mlock changes - + mlock_fixup() will call make_pages_present() in the hugetlbfs VMA range to + allocate the huge pages and populate the ptes. + +3) VMAs with VM_DONTEXPAND are generally userspace mappings of kernel pages, + such as the VDSO page, relay channel pages, etc. These pages + are inherently unevictable and are not managed on the LRU lists. + mlock_fixup() treats these VMAs the same as hugetlbfs VMAs. It calls + make_pages_present() to populate the ptes. + +Note that for all of these special VMAs, mlock_fixup() does not set the +VM_LOCKED flag. Therefore, we won't have to deal with them later during +munlock(), munmap() or task exit. Neither does mlock_fixup() account these +VMAs against the task's "locked_vm". + +.. _munlock_munlockall_handling: + +munlock()/munlockall() System Call Handling +------------------------------------------- + +The munlock() and munlockall() system calls are handled by the same functions - +do_mlock[all]() - as the mlock() and mlockall() system calls with the unlock vs +lock operation indicated by an argument. So, these system calls are also +handled by mlock_fixup(). Again, if called for an already munlocked VMA, +mlock_fixup() simply returns. Because of the VMA filtering discussed above, +VM_LOCKED will not be set in any "special" VMAs. So, these VMAs will be +ignored for munlock. + +If the VMA is VM_LOCKED, mlock_fixup() again attempts to merge or split off the +specified range. The range is then munlocked via the function +populate_vma_page_range() - the same function used to mlock a VMA range - +passing a flag to indicate that munlock() is being performed. + +Because the VMA access protections could have been changed to PROT_NONE after +faulting in and mlocking pages, get_user_pages() was unreliable for visiting +these pages for munlocking. Because we don't want to leave pages mlocked, +get_user_pages() was enhanced to accept a flag to ignore the permissions when +fetching the pages - all of which should be resident as a result of previous +mlocking. + +For munlock(), populate_vma_page_range() unlocks individual pages by calling +munlock_vma_page(). munlock_vma_page() unconditionally clears the PG_mlocked +flag using TestClearPageMlocked(). 
As with mlock_vma_page(),
+munlock_vma_page() uses the Test*PageMlocked() function to handle the case
+where the page might have already been unlocked by another task. If the page
+was mlocked, munlock_vma_page() updates the zone statistics for the number of
+mlocked pages. Note, however, that at this point we haven't checked whether
+the page is mapped by other VM_LOCKED VMAs.
+
+We can't call try_to_munlock(), the function that walks the reverse map to
+check for other VM_LOCKED VMAs, without first isolating the page from the LRU.
+try_to_munlock() is a variant of try_to_unmap() and thus requires that the page
+not be on an LRU list [more on these below]. However, the call to
+isolate_lru_page() could fail, in which case we can't call try_to_munlock().
+So, we go ahead and clear PG_mlocked up front, as this might be the only
+chance we have. If we can successfully isolate the page, we go ahead and call
+try_to_munlock(), which will restore the PG_mlocked flag and update the zone
+page statistics if it finds another VMA holding the page mlocked. If we fail
+to isolate the page, we'll have left a potentially mlocked page on the LRU.
+This is fine, because we'll catch it later if and when vmscan tries to reclaim
+the page. This should be relatively rare.
+
+
+Migrating MLOCKED Pages
+-----------------------
+
+A page that is being migrated has been isolated from the LRU lists and is held
+locked across unmapping of the page, updating the page's address space entry
+and copying the contents and state, until the page table entry has been
+replaced with an entry that refers to the new page. Linux supports migration
+of mlocked pages and other unevictable pages. This involves simply moving the
+PG_mlocked and PG_unevictable states from the old page to the new page.
+
+Note that page migration can race with mlocking or munlocking of the same page.
+This has been discussed from the mlock/munlock perspective in the respective
+sections above. Both processes (migration and m[un]locking) hold the page
+locked. This provides the first level of synchronization. Page migration
+zeros out the page_mapping of the old page before unlocking it, so m[un]lock
+can skip these pages by testing the page mapping under page lock.
+
+To complete page migration, we place the new and old pages back onto the LRU
+after dropping the page lock. The "unneeded" page - old page on success, new
+page on failure - will be freed when the reference count held by the migration
+process is released. To ensure that we don't strand pages on the unevictable
+list because of a race between munlock and migration, page migration uses the
+putback_lru_page() function to add migrated pages back to the LRU.
+
+
+Compacting MLOCKED Pages
+------------------------
+
+The unevictable LRU can be scanned for compactable regions and the default
+behavior is to do so. /proc/sys/vm/compact_unevictable_allowed controls
+this behavior (see Documentation/sysctl/vm.txt). Once scanning of the
+unevictable LRU is enabled, the work of compaction is mostly handled by
+the page migration code and the same work flow as described in "Migrating
+MLOCKED Pages" will apply.
+
+MLOCKING Transparent Huge Pages
+-------------------------------
+
+A transparent huge page is represented by a single entry on an LRU list.
+Therefore, we can only make unevictable an entire compound page, not
+individual subpages.
+
+If a user tries to mlock() part of a huge page, we want the rest of the
+page to be reclaimable.
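+
+For instance, the situation arises from something as simple as the
+following (a hypothetical userspace sketch; 2M assumes the x86_64 PMD
+size)::
+
+    #include <stdlib.h>
+    #include <string.h>
+    #include <sys/mman.h>
+
+    #define HPAGE_SIZE (2UL * 1024 * 1024)
+
+    int main(void)
+    {
+            void *buf;
+
+            if (posix_memalign(&buf, HPAGE_SIZE, HPAGE_SIZE))
+                    return 1;
+            madvise(buf, HPAGE_SIZE, MADV_HUGEPAGE);
+            memset(buf, 0, HPAGE_SIZE);     /* may fault in a THP */
+
+            /* lock only the first 4k of the huge page */
+            mlock(buf, 4096);
+            return 0;
+    }
+
+Here the single huge page backing ``buf`` must stay usable by reclaim
+apart from its first subpage; the paragraphs below describe how that is
+achieved.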
+
+We cannot just split the page on partial mlock() as split_huge_page() can
+fail, and a new intermittent failure mode for the syscall is undesirable.
+
+We handle this by keeping PTE-mapped huge pages on normal LRU lists: the
+PMD on the border of a VM_LOCKED VMA will be split into a PTE table.
+
+This way the huge page is accessible for vmscan. Under memory pressure the
+page will be split; subpages which belong to VM_LOCKED VMAs will be moved
+to the unevictable LRU and the rest can be reclaimed.
+
+See also the comment in follow_trans_huge_pmd().
+
+mmap(MAP_LOCKED) System Call Handling
+-------------------------------------
+
+In addition to the mlock()/mlockall() system calls, an application can
+request that a region of memory be mlocked by supplying the MAP_LOCKED flag
+to the mmap() call. There is one important and subtle difference here,
+though. mmap() + mlock() will fail if the range cannot be faulted in (e.g.
+because mm_populate() fails), returning ENOMEM, while mmap(MAP_LOCKED) will
+not fail. The mmap()ed area will still have properties of the locked area -
+i.e. pages will not get swapped out - but major page faults to fault memory
+in might still happen.
+
+Furthermore, any mmap() or brk() call that expands the heap, made by a task
+that has previously called mlockall() with the MCL_FUTURE flag, will result
+in the newly mapped memory being mlocked. Before the unevictable/mlock
+changes, the kernel simply called make_pages_present() to allocate pages
+and populate the page table.
+
+To mlock a range of memory under the unevictable/mlock infrastructure, the
+mmap() handler and task address space expansion functions call
+populate_vma_page_range() specifying the vma and the address range to mlock.
+
+The callers of populate_vma_page_range() will have already added the memory
+range to be mlocked to the task's "locked_vm". To account for filtered
+VMAs, populate_vma_page_range() returns the number of pages NOT mlocked.
+All of the callers then subtract a non-negative return value from the
+task's locked_vm. A negative return value represents an error - for
+example, from get_user_pages() attempting to fault in a VMA with PROT_NONE
+access. In this case, we leave the memory range accounted as locked_vm, as
+the protections could be changed later and pages allocated into that
+region.
+
+
+munmap()/exit()/exec() System Call Handling
+-------------------------------------------
+
+When unmapping an mlocked region of memory, whether by an explicit call to
+munmap() or via an internal unmap from exit() or exec() processing, we must
+munlock the pages if we're removing the last VM_LOCKED VMA that maps the
+pages. Before the unevictable/mlock changes, mlocking did not mark the
+pages in any way, so unmapping them required no processing.
+
+To munlock a range of memory under the unevictable/mlock infrastructure,
+the munmap() handler and the task address space tear-down function call
+munlock_vma_pages_all(). The name reflects the observation that one always
+specifies the entire VMA range when munlock()ing during unmap of a region.
+Because of the VMA filtering when mlock()ing regions, only "normal" VMAs
+that actually contain mlocked pages will be passed to
+munlock_vma_pages_all().
+
+munlock_vma_pages_all() clears the VM_LOCKED VMA flag and, like
+mlock_fixup() for the munlock case, calls __munlock_vma_pages_range() to
+walk the page table for the VMA's memory range and munlock_vma_page() each
+resident page mapped by the VMA.
+
+Furthermore, any mmap() or brk() call that expands the address space of a
+task that has previously called mlockall() with the MCL_FUTURE flag will
+result in the newly mapped memory being mlocked. Before the unevictable/mlock
+changes, the kernel simply called make_pages_present() to allocate pages and
+populate the page table.
+
+To mlock a range of memory under the unevictable/mlock infrastructure, the
+mmap() handler and task address space expansion functions call
+populate_vma_page_range() specifying the vma and the address range to mlock.
+
+The callers of populate_vma_page_range() will have already added the memory
+range to be mlocked to the task's "locked_vm". To account for filtered VMAs,
+populate_vma_page_range() returns the number of pages NOT mlocked. All of the
+callers then subtract a non-negative return value from the task's locked_vm. A
+negative return value represents an error - for example, from get_user_pages()
+attempting to fault in a VMA with PROT_NONE access. In this case, we leave the
+memory range accounted as locked_vm, as the protections could be changed later
+and pages allocated into that region.
+
+
+munmap()/exit()/exec() System Call Handling
+-------------------------------------------
+
+When unmapping an mlocked region of memory, whether by an explicit call to
+munmap() or via an internal unmap from exit() or exec() processing, we must
+munlock the pages if we're removing the last VM_LOCKED VMA that maps the pages.
+Before the unevictable/mlock changes, mlocking did not mark the pages in any
+way, so unmapping them required no processing.
+
+To munlock a range of memory under the unevictable/mlock infrastructure, the
+munmap() handler and the task address space tear-down function call
+munlock_vma_pages_all(). The name reflects the observation that one always
+specifies the entire VMA range when munlock()ing during unmap of a region.
+Because of the VMA filtering when mlock()ing regions, only "normal" VMAs that
+actually contain mlocked pages will be passed to munlock_vma_pages_all().
+
+munlock_vma_pages_all() clears the VM_LOCKED VMA flag and, like mlock_fixup()
+for the munlock case, calls __munlock_vma_pages_range() to walk the page table
+for the VMA's memory range and munlock_vma_page() each resident page mapped by
+the VMA. This effectively munlocks the page, but only if this is the last
+VM_LOCKED VMA that maps the page.
+
+
+try_to_unmap()
+--------------
+
+Pages can, of course, be mapped into multiple VMAs. Some of these VMAs may
+have the VM_LOCKED flag set. It is possible for a page mapped into one or more
+VM_LOCKED VMAs not to have the PG_mlocked flag set and therefore reside on one
+of the active or inactive LRU lists. This could happen if, for example, a task
+in the process of munlocking the page could not isolate the page from the LRU.
+As a result, vmscan/shrink_page_list() might encounter such a page as described
+in section "vmscan's handling of unevictable pages". To handle this situation,
+try_to_unmap() checks for VM_LOCKED VMAs while it is walking a page's reverse
+map.
+
+try_to_unmap() is always called, by either vmscan for reclaim or for page
+migration, with the argument page locked and isolated from the LRU. Separate
+functions handle anonymous, mapped file and KSM pages, as these types of
+pages have different reverse map lookup mechanisms, with different locking.
+In each case, whether rmap_walk_anon(), rmap_walk_file() or rmap_walk_ksm(),
+the walk will call try_to_unmap_one() for every VMA which might contain the
+page.
+
+When trying to reclaim, if try_to_unmap_one() finds the page in a VM_LOCKED
+VMA, it will then mlock the page via mlock_vma_page() instead of unmapping it,
+and return SWAP_MLOCK to indicate that the page is unevictable; the scan
+stops there.
+
+mlock_vma_page() is called while holding the page table's lock (in addition
+to the page lock and the rmap lock) to serialize against concurrent mlock,
+munlock and munmap system calls, mm teardown (munlock_vma_pages_all), reclaim,
+holepunching, and truncation of file pages and their anonymous COWed pages.
+
+
+try_to_munlock() Reverse Map Scan
+---------------------------------
+
+.. warning::
+   [!] TODO/FIXME: a better name might be page_mlocked() - analogous to the
+   page_referenced() reverse map walker.
+
+When munlock_vma_page() [see section :ref:`munlock()/munlockall() System Call
+Handling <munlock_munlockall_handling>` above] tries to munlock a
+page, it needs to determine whether or not the page is mapped by any
+VM_LOCKED VMA without actually attempting to unmap all PTEs from the
+page. For this purpose, the unevictable/mlock infrastructure
+introduced a variant of try_to_unmap() called try_to_munlock().
+
+try_to_munlock() calls the same functions as try_to_unmap() for anonymous and
+mapped file and KSM pages with a flag argument specifying unlock versus unmap
+processing. Again, these functions walk the respective reverse maps looking
+for VM_LOCKED VMAs. When such a VMA is found, as in the try_to_unmap() case,
+the functions mlock the page via mlock_vma_page() and return SWAP_MLOCK. This
+undoes the pre-clearing of the page's PG_mlocked done by munlock_vma_page().
+
+Note that try_to_munlock()'s reverse map walk must visit every VMA in a page's
+reverse map to determine that a page is NOT mapped into any VM_LOCKED VMA.
+However, the scan can terminate as soon as it encounters a VM_LOCKED VMA.
+Although try_to_munlock() might be called a great many times when munlocking a
+large region or tearing down a large address space that has been mlocked via
+mlockall(), overall this is a fairly rare event.
+
+
+Page Reclaim in shrink_*_list()
+-------------------------------
+
+shrink_active_list() culls any obviously unevictable pages - i.e.
+!page_evictable(page) - diverting these to the unevictable list.
+
+However, shrink_active_list() only sees unevictable pages that made it onto the
+active/inactive LRU lists. Note that these pages do not have PageUnevictable
+set - otherwise they would be on the unevictable list and shrink_active_list()
+would never see them.
+
+Some examples of these unevictable pages on the LRU lists are:
+
+ (1) ramfs pages that have been placed on the LRU lists when first allocated.
+
+ (2) SHM_LOCK'd shared memory pages. shmctl(SHM_LOCK) does not attempt to
+     allocate or fault in the pages in the shared memory region. Allocation
+     happens when an application accesses a page for the first time after
+     SHM_LOCK'ing the segment.
+
+ (3) mlocked pages that could not be isolated from the LRU and moved to the
+     unevictable list in mlock_vma_page().
+
+shrink_inactive_list() also diverts any unevictable pages that it finds on the
+inactive lists to the appropriate zone's unevictable list.
+
+shrink_inactive_list() should only see SHM_LOCK'd pages that became SHM_LOCK'd
+after shrink_active_list() had moved them to the inactive list, or pages mapped
+into VM_LOCKED VMAs that munlock_vma_page() couldn't isolate from the LRU to
+recheck via try_to_munlock(). shrink_inactive_list() won't notice the latter,
+but will pass them on to shrink_page_list().
+
+shrink_page_list() again culls obviously unevictable pages that it could
+encounter, for similar reasons to shrink_inactive_list(). Pages mapped into
+VM_LOCKED VMAs but without PG_mlocked set will make it all the way to
+try_to_unmap(). shrink_page_list() will divert them to the unevictable list
+when try_to_unmap() returns SWAP_MLOCK, as discussed above.
diff --git a/Documentation/vm/unevictable-lru.txt b/Documentation/vm/unevictable-lru.txt
deleted file mode 100644
index fdd84cb8d511..000000000000
--- a/Documentation/vm/unevictable-lru.txt
+++ /dev/null
@@ -1,614 +0,0 @@
-.. _unevictable_lru:
-
-==============================
-Unevictable LRU Infrastructure
-==============================
-
-.. contents:: :local:
-
-
-Introduction
-============
-
-This document describes the Linux memory manager's "Unevictable LRU"
-infrastructure and the use of this to manage several types of "unevictable"
-pages.
-
-The document attempts to provide the overall rationale behind this mechanism
-and the rationale for some of the design decisions that drove the
-implementation. The latter design rationale is discussed in the context of an
-implementation description. Admittedly, one can obtain the implementation
-details - the "what does it do?" - by reading the code. One hopes that the
-descriptions below add value by provide the answer to "why does it do that?".
-
-
-
-The Unevictable LRU
-===================
-
-The Unevictable LRU facility adds an additional LRU list to track unevictable
-pages and to hide these pages from vmscan. This mechanism is based on a patch
-by Larry Woodman of Red Hat to address several scalability problems with page
-reclaim in Linux. The problems have been observed at customer sites on large
-memory x86_64 systems.
-
-To illustrate this with an example, a non-NUMA x86_64 platform with 128GB of
-main memory will have over 32 million 4k pages in a single zone. When a large
-fraction of these pages are not evictable for any reason [see below], vmscan
-will spend a lot of time scanning the LRU lists looking for the small fraction
-of pages that are evictable. This can result in a situation where all CPUs are
-spending 100% of their time in vmscan for hours or days on end, with the system
-completely unresponsive.
- -The unevictable list addresses the following classes of unevictable pages: - - * Those owned by ramfs. - - * Those mapped into SHM_LOCK'd shared memory regions. - - * Those mapped into VM_LOCKED [mlock()ed] VMAs. - -The infrastructure may also be able to handle other conditions that make pages -unevictable, either by definition or by circumstance, in the future. - - -The Unevictable Page List -------------------------- - -The Unevictable LRU infrastructure consists of an additional, per-zone, LRU list -called the "unevictable" list and an associated page flag, PG_unevictable, to -indicate that the page is being managed on the unevictable list. - -The PG_unevictable flag is analogous to, and mutually exclusive with, the -PG_active flag in that it indicates on which LRU list a page resides when -PG_lru is set. - -The Unevictable LRU infrastructure maintains unevictable pages on an additional -LRU list for a few reasons: - - (1) We get to "treat unevictable pages just like we treat other pages in the - system - which means we get to use the same code to manipulate them, the - same code to isolate them (for migrate, etc.), the same code to keep track - of the statistics, etc..." [Rik van Riel] - - (2) We want to be able to migrate unevictable pages between nodes for memory - defragmentation, workload management and memory hotplug. The linux kernel - can only migrate pages that it can successfully isolate from the LRU - lists. If we were to maintain pages elsewhere than on an LRU-like list, - where they can be found by isolate_lru_page(), we would prevent their - migration, unless we reworked migration code to find the unevictable pages - itself. - - -The unevictable list does not differentiate between file-backed and anonymous, -swap-backed pages. This differentiation is only important while the pages are, -in fact, evictable. - -The unevictable list benefits from the "arrayification" of the per-zone LRU -lists and statistics originally proposed and posted by Christoph Lameter. - -The unevictable list does not use the LRU pagevec mechanism. Rather, -unevictable pages are placed directly on the page's zone's unevictable list -under the zone lru_lock. This allows us to prevent the stranding of pages on -the unevictable list when one task has the page isolated from the LRU and other -tasks are changing the "evictability" state of the page. - - -Memory Control Group Interaction --------------------------------- - -The unevictable LRU facility interacts with the memory control group [aka -memory controller; see Documentation/cgroup-v1/memory.txt] by extending the -lru_list enum. - -The memory controller data structure automatically gets a per-zone unevictable -list as a result of the "arrayification" of the per-zone LRU lists (one per -lru_list enum element). The memory controller tracks the movement of pages to -and from the unevictable list. - -When a memory control group comes under memory pressure, the controller will -not attempt to reclaim pages on the unevictable list. This has a couple of -effects: - - (1) Because the pages are "hidden" from reclaim on the unevictable list, the - reclaim process can be more efficient, dealing only with pages that have a - chance of being reclaimed. - - (2) On the other hand, if too many of the pages charged to the control group - are unevictable, the evictable portion of the working set of the tasks in - the control group may not fit into the available memory. This can cause - the control group to thrash or to OOM-kill tasks. - - -.. 
_mark_addr_space_unevict: - -Marking Address Spaces Unevictable ----------------------------------- - -For facilities such as ramfs none of the pages attached to the address space -may be evicted. To prevent eviction of any such pages, the AS_UNEVICTABLE -address space flag is provided, and this can be manipulated by a filesystem -using a number of wrapper functions: - - * ``void mapping_set_unevictable(struct address_space *mapping);`` - - Mark the address space as being completely unevictable. - - * ``void mapping_clear_unevictable(struct address_space *mapping);`` - - Mark the address space as being evictable. - - * ``int mapping_unevictable(struct address_space *mapping);`` - - Query the address space, and return true if it is completely - unevictable. - -These are currently used in two places in the kernel: - - (1) By ramfs to mark the address spaces of its inodes when they are created, - and this mark remains for the life of the inode. - - (2) By SYSV SHM to mark SHM_LOCK'd address spaces until SHM_UNLOCK is called. - - Note that SHM_LOCK is not required to page in the locked pages if they're - swapped out; the application must touch the pages manually if it wants to - ensure they're in memory. - - -Detecting Unevictable Pages ---------------------------- - -The function page_evictable() in vmscan.c determines whether a page is -evictable or not using the query function outlined above [see section -:ref:`Marking address spaces unevictable `] -to check the AS_UNEVICTABLE flag. - -For address spaces that are so marked after being populated (as SHM regions -might be), the lock action (eg: SHM_LOCK) can be lazy, and need not populate -the page tables for the region as does, for example, mlock(), nor need it make -any special effort to push any pages in the SHM_LOCK'd area to the unevictable -list. Instead, vmscan will do this if and when it encounters the pages during -a reclamation scan. - -On an unlock action (such as SHM_UNLOCK), the unlocker (eg: shmctl()) must scan -the pages in the region and "rescue" them from the unevictable list if no other -condition is keeping them unevictable. If an unevictable region is destroyed, -the pages are also "rescued" from the unevictable list in the process of -freeing them. - -page_evictable() also checks for mlocked pages by testing an additional page -flag, PG_mlocked (as wrapped by PageMlocked()), which is set when a page is -faulted into a VM_LOCKED vma, or found in a vma being VM_LOCKED. - - -Vmscan's Handling of Unevictable Pages --------------------------------------- - -If unevictable pages are culled in the fault path, or moved to the unevictable -list at mlock() or mmap() time, vmscan will not encounter the pages until they -have become evictable again (via munlock() for example) and have been "rescued" -from the unevictable list. However, there may be situations where we decide, -for the sake of expediency, to leave a unevictable page on one of the regular -active/inactive LRU lists for vmscan to deal with. vmscan checks for such -pages in all of the shrink_{active|inactive|page}_list() functions and will -"cull" such pages that it encounters: that is, it diverts those pages to the -unevictable list for the zone being scanned. - -There may be situations where a page is mapped into a VM_LOCKED VMA, but the -page is not marked as PG_mlocked. Such pages will make it all the way to -shrink_page_list() where they will be detected when vmscan walks the reverse -map in try_to_unmap(). 
If try_to_unmap() returns SWAP_MLOCK, -shrink_page_list() will cull the page at that point. - -To "cull" an unevictable page, vmscan simply puts the page back on the LRU list -using putback_lru_page() - the inverse operation to isolate_lru_page() - after -dropping the page lock. Because the condition which makes the page unevictable -may change once the page is unlocked, putback_lru_page() will recheck the -unevictable state of a page that it places on the unevictable list. If the -page has become unevictable, putback_lru_page() removes it from the list and -retries, including the page_unevictable() test. Because such a race is a rare -event and movement of pages onto the unevictable list should be rare, these -extra evictabilty checks should not occur in the majority of calls to -putback_lru_page(). - - -MLOCKED Pages -============= - -The unevictable page list is also useful for mlock(), in addition to ramfs and -SYSV SHM. Note that mlock() is only available in CONFIG_MMU=y situations; in -NOMMU situations, all mappings are effectively mlocked. - - -History -------- - -The "Unevictable mlocked Pages" infrastructure is based on work originally -posted by Nick Piggin in an RFC patch entitled "mm: mlocked pages off LRU". -Nick posted his patch as an alternative to a patch posted by Christoph Lameter -to achieve the same objective: hiding mlocked pages from vmscan. - -In Nick's patch, he used one of the struct page LRU list link fields as a count -of VM_LOCKED VMAs that map the page. This use of the link field for a count -prevented the management of the pages on an LRU list, and thus mlocked pages -were not migratable as isolate_lru_page() could not find them, and the LRU list -link field was not available to the migration subsystem. - -Nick resolved this by putting mlocked pages back on the lru list before -attempting to isolate them, thus abandoning the count of VM_LOCKED VMAs. When -Nick's patch was integrated with the Unevictable LRU work, the count was -replaced by walking the reverse map to determine whether any VM_LOCKED VMAs -mapped the page. More on this below. - - -Basic Management ----------------- - -mlocked pages - pages mapped into a VM_LOCKED VMA - are a class of unevictable -pages. When such a page has been "noticed" by the memory management subsystem, -the page is marked with the PG_mlocked flag. This can be manipulated using the -PageMlocked() functions. - -A PG_mlocked page will be placed on the unevictable list when it is added to -the LRU. Such pages can be "noticed" by memory management in several places: - - (1) in the mlock()/mlockall() system call handlers; - - (2) in the mmap() system call handler when mmapping a region with the - MAP_LOCKED flag; - - (3) mmapping a region in a task that has called mlockall() with the MCL_FUTURE - flag - - (4) in the fault path, if mlocked pages are "culled" in the fault path, - and when a VM_LOCKED stack segment is expanded; or - - (5) as mentioned above, in vmscan:shrink_page_list() when attempting to - reclaim a page in a VM_LOCKED VMA via try_to_unmap() - -all of which result in the VM_LOCKED flag being set for the VMA if it doesn't -already have it set. 
- -mlocked pages become unlocked and rescued from the unevictable list when: - - (1) mapped in a range unlocked via the munlock()/munlockall() system calls; - - (2) munmap()'d out of the last VM_LOCKED VMA that maps the page, including - unmapping at task exit; - - (3) when the page is truncated from the last VM_LOCKED VMA of an mmapped file; - or - - (4) before a page is COW'd in a VM_LOCKED VMA. - - -mlock()/mlockall() System Call Handling ---------------------------------------- - -Both [do\_]mlock() and [do\_]mlockall() system call handlers call mlock_fixup() -for each VMA in the range specified by the call. In the case of mlockall(), -this is the entire active address space of the task. Note that mlock_fixup() -is used for both mlocking and munlocking a range of memory. A call to mlock() -an already VM_LOCKED VMA, or to munlock() a VMA that is not VM_LOCKED is -treated as a no-op, and mlock_fixup() simply returns. - -If the VMA passes some filtering as described in "Filtering Special Vmas" -below, mlock_fixup() will attempt to merge the VMA with its neighbors or split -off a subset of the VMA if the range does not cover the entire VMA. Once the -VMA has been merged or split or neither, mlock_fixup() will call -populate_vma_page_range() to fault in the pages via get_user_pages() and to -mark the pages as mlocked via mlock_vma_page(). - -Note that the VMA being mlocked might be mapped with PROT_NONE. In this case, -get_user_pages() will be unable to fault in the pages. That's okay. If pages -do end up getting faulted into this VM_LOCKED VMA, we'll handle them in the -fault path or in vmscan. - -Also note that a page returned by get_user_pages() could be truncated or -migrated out from under us, while we're trying to mlock it. To detect this, -populate_vma_page_range() checks page_mapping() after acquiring the page lock. -If the page is still associated with its mapping, we'll go ahead and call -mlock_vma_page(). If the mapping is gone, we just unlock the page and move on. -In the worst case, this will result in a page mapped in a VM_LOCKED VMA -remaining on a normal LRU list without being PageMlocked(). Again, vmscan will -detect and cull such pages. - -mlock_vma_page() will call TestSetPageMlocked() for each page returned by -get_user_pages(). We use TestSetPageMlocked() because the page might already -be mlocked by another task/VMA and we don't want to do extra work. We -especially do not want to count an mlocked page more than once in the -statistics. If the page was already mlocked, mlock_vma_page() need do nothing -more. - -If the page was NOT already mlocked, mlock_vma_page() attempts to isolate the -page from the LRU, as it is likely on the appropriate active or inactive list -at that time. If the isolate_lru_page() succeeds, mlock_vma_page() will put -back the page - by calling putback_lru_page() - which will notice that the page -is now mlocked and divert the page to the zone's unevictable list. If -mlock_vma_page() is unable to isolate the page from the LRU, vmscan will handle -it later if and when it attempts to reclaim the page. - - -Filtering Special VMAs ----------------------- - -mlock_fixup() filters several classes of "special" VMAs: - -1) VMAs with VM_IO or VM_PFNMAP set are skipped entirely. The pages behind - these mappings are inherently pinned, so we don't need to mark them as - mlocked. In any case, most of the pages have no struct page in which to so - mark the page. 
Because of this, get_user_pages() will fail for these VMAs, - so there is no sense in attempting to visit them. - -2) VMAs mapping hugetlbfs page are already effectively pinned into memory. We - neither need nor want to mlock() these pages. However, to preserve the - prior behavior of mlock() - before the unevictable/mlock changes - - mlock_fixup() will call make_pages_present() in the hugetlbfs VMA range to - allocate the huge pages and populate the ptes. - -3) VMAs with VM_DONTEXPAND are generally userspace mappings of kernel pages, - such as the VDSO page, relay channel pages, etc. These pages - are inherently unevictable and are not managed on the LRU lists. - mlock_fixup() treats these VMAs the same as hugetlbfs VMAs. It calls - make_pages_present() to populate the ptes. - -Note that for all of these special VMAs, mlock_fixup() does not set the -VM_LOCKED flag. Therefore, we won't have to deal with them later during -munlock(), munmap() or task exit. Neither does mlock_fixup() account these -VMAs against the task's "locked_vm". - -.. _munlock_munlockall_handling: - -munlock()/munlockall() System Call Handling -------------------------------------------- - -The munlock() and munlockall() system calls are handled by the same functions - -do_mlock[all]() - as the mlock() and mlockall() system calls with the unlock vs -lock operation indicated by an argument. So, these system calls are also -handled by mlock_fixup(). Again, if called for an already munlocked VMA, -mlock_fixup() simply returns. Because of the VMA filtering discussed above, -VM_LOCKED will not be set in any "special" VMAs. So, these VMAs will be -ignored for munlock. - -If the VMA is VM_LOCKED, mlock_fixup() again attempts to merge or split off the -specified range. The range is then munlocked via the function -populate_vma_page_range() - the same function used to mlock a VMA range - -passing a flag to indicate that munlock() is being performed. - -Because the VMA access protections could have been changed to PROT_NONE after -faulting in and mlocking pages, get_user_pages() was unreliable for visiting -these pages for munlocking. Because we don't want to leave pages mlocked, -get_user_pages() was enhanced to accept a flag to ignore the permissions when -fetching the pages - all of which should be resident as a result of previous -mlocking. - -For munlock(), populate_vma_page_range() unlocks individual pages by calling -munlock_vma_page(). munlock_vma_page() unconditionally clears the PG_mlocked -flag using TestClearPageMlocked(). As with mlock_vma_page(), -munlock_vma_page() use the Test*PageMlocked() function to handle the case where -the page might have already been unlocked by another task. If the page was -mlocked, munlock_vma_page() updates that zone statistics for the number of -mlocked pages. Note, however, that at this point we haven't checked whether -the page is mapped by other VM_LOCKED VMAs. - -We can't call try_to_munlock(), the function that walks the reverse map to -check for other VM_LOCKED VMAs, without first isolating the page from the LRU. -try_to_munlock() is a variant of try_to_unmap() and thus requires that the page -not be on an LRU list [more on these below]. However, the call to -isolate_lru_page() could fail, in which case we couldn't try_to_munlock(). So, -we go ahead and clear PG_mlocked up front, as this might be the only chance we -have. 
If we can successfully isolate the page, we go ahead and -try_to_munlock(), which will restore the PG_mlocked flag and update the zone -page statistics if it finds another VMA holding the page mlocked. If we fail -to isolate the page, we'll have left a potentially mlocked page on the LRU. -This is fine, because we'll catch it later if and if vmscan tries to reclaim -the page. This should be relatively rare. - - -Migrating MLOCKED Pages ------------------------ - -A page that is being migrated has been isolated from the LRU lists and is held -locked across unmapping of the page, updating the page's address space entry -and copying the contents and state, until the page table entry has been -replaced with an entry that refers to the new page. Linux supports migration -of mlocked pages and other unevictable pages. This involves simply moving the -PG_mlocked and PG_unevictable states from the old page to the new page. - -Note that page migration can race with mlocking or munlocking of the same page. -This has been discussed from the mlock/munlock perspective in the respective -sections above. Both processes (migration and m[un]locking) hold the page -locked. This provides the first level of synchronization. Page migration -zeros out the page_mapping of the old page before unlocking it, so m[un]lock -can skip these pages by testing the page mapping under page lock. - -To complete page migration, we place the new and old pages back onto the LRU -after dropping the page lock. The "unneeded" page - old page on success, new -page on failure - will be freed when the reference count held by the migration -process is released. To ensure that we don't strand pages on the unevictable -list because of a race between munlock and migration, page migration uses the -putback_lru_page() function to add migrated pages back to the LRU. - - -Compacting MLOCKED Pages ------------------------- - -The unevictable LRU can be scanned for compactable regions and the default -behavior is to do so. /proc/sys/vm/compact_unevictable_allowed controls -this behavior (see Documentation/sysctl/vm.txt). Once scanning of the -unevictable LRU is enabled, the work of compaction is mostly handled by -the page migration code and the same work flow as described in MIGRATING -MLOCKED PAGES will apply. - -MLOCKING Transparent Huge Pages -------------------------------- - -A transparent huge page is represented by a single entry on an LRU list. -Therefore, we can only make unevictable an entire compound page, not -individual subpages. - -If a user tries to mlock() part of a huge page, we want the rest of the -page to be reclaimable. - -We cannot just split the page on partial mlock() as split_huge_page() can -fail and new intermittent failure mode for the syscall is undesirable. - -We handle this by keeping PTE-mapped huge pages on normal LRU lists: the -PMD on border of VM_LOCKED VMA will be split into PTE table. - -This way the huge page is accessible for vmscan. Under memory pressure the -page will be split, subpages which belong to VM_LOCKED VMAs will be moved -to unevictable LRU and the rest can be reclaimed. - -See also comment in follow_trans_huge_pmd(). - -mmap(MAP_LOCKED) System Call Handling -------------------------------------- - -In addition the mlock()/mlockall() system calls, an application can request -that a region of memory be mlocked supplying the MAP_LOCKED flag to the mmap() -call. There is one important and subtle difference here, though. mmap() + mlock() -will fail if the range cannot be faulted in (e.g. 
because mm_populate fails) -and returns with ENOMEM while mmap(MAP_LOCKED) will not fail. The mmaped -area will still have properties of the locked area - aka. pages will not get -swapped out - but major page faults to fault memory in might still happen. - -Furthermore, any mmap() call or brk() call that expands the heap by a -task that has previously called mlockall() with the MCL_FUTURE flag will result -in the newly mapped memory being mlocked. Before the unevictable/mlock -changes, the kernel simply called make_pages_present() to allocate pages and -populate the page table. - -To mlock a range of memory under the unevictable/mlock infrastructure, the -mmap() handler and task address space expansion functions call -populate_vma_page_range() specifying the vma and the address range to mlock. - -The callers of populate_vma_page_range() will have already added the memory range -to be mlocked to the task's "locked_vm". To account for filtered VMAs, -populate_vma_page_range() returns the number of pages NOT mlocked. All of the -callers then subtract a non-negative return value from the task's locked_vm. A -negative return value represent an error - for example, from get_user_pages() -attempting to fault in a VMA with PROT_NONE access. In this case, we leave the -memory range accounted as locked_vm, as the protections could be changed later -and pages allocated into that region. - - -munmap()/exit()/exec() System Call Handling -------------------------------------------- - -When unmapping an mlocked region of memory, whether by an explicit call to -munmap() or via an internal unmap from exit() or exec() processing, we must -munlock the pages if we're removing the last VM_LOCKED VMA that maps the pages. -Before the unevictable/mlock changes, mlocking did not mark the pages in any -way, so unmapping them required no processing. - -To munlock a range of memory under the unevictable/mlock infrastructure, the -munmap() handler and task address space call tear down function -munlock_vma_pages_all(). The name reflects the observation that one always -specifies the entire VMA range when munlock()ing during unmap of a region. -Because of the VMA filtering when mlocking() regions, only "normal" VMAs that -actually contain mlocked pages will be passed to munlock_vma_pages_all(). - -munlock_vma_pages_all() clears the VM_LOCKED VMA flag and, like mlock_fixup() -for the munlock case, calls __munlock_vma_pages_range() to walk the page table -for the VMA's memory range and munlock_vma_page() each resident page mapped by -the VMA. This effectively munlocks the page, only if this is the last -VM_LOCKED VMA that maps the page. - - -try_to_unmap() --------------- - -Pages can, of course, be mapped into multiple VMAs. Some of these VMAs may -have VM_LOCKED flag set. It is possible for a page mapped into one or more -VM_LOCKED VMAs not to have the PG_mlocked flag set and therefore reside on one -of the active or inactive LRU lists. This could happen if, for example, a task -in the process of munlocking the page could not isolate the page from the LRU. -As a result, vmscan/shrink_page_list() might encounter such a page as described -in section "vmscan's handling of unevictable pages". To handle this situation, -try_to_unmap() checks for VM_LOCKED VMAs while it is walking a page's reverse -map. - -try_to_unmap() is always called, by either vmscan for reclaim or for page -migration, with the argument page locked and isolated from the LRU. 
Separate -functions handle anonymous and mapped file and KSM pages, as these types of -pages have different reverse map lookup mechanisms, with different locking. -In each case, whether rmap_walk_anon() or rmap_walk_file() or rmap_walk_ksm(), -it will call try_to_unmap_one() for every VMA which might contain the page. - -When trying to reclaim, if try_to_unmap_one() finds the page in a VM_LOCKED -VMA, it will then mlock the page via mlock_vma_page() instead of unmapping it, -and return SWAP_MLOCK to indicate that the page is unevictable: and the scan -stops there. - -mlock_vma_page() is called while holding the page table's lock (in addition -to the page lock, and the rmap lock): to serialize against concurrent mlock or -munlock or munmap system calls, mm teardown (munlock_vma_pages_all), reclaim, -holepunching, and truncation of file pages and their anonymous COWed pages. - - -try_to_munlock() Reverse Map Scan ---------------------------------- - -.. warning:: - [!] TODO/FIXME: a better name might be page_mlocked() - analogous to the - page_referenced() reverse map walker. - -When munlock_vma_page() [see section :ref:`munlock()/munlockall() System Call -Handling ` above] tries to munlock a -page, it needs to determine whether or not the page is mapped by any -VM_LOCKED VMA without actually attempting to unmap all PTEs from the -page. For this purpose, the unevictable/mlock infrastructure -introduced a variant of try_to_unmap() called try_to_munlock(). - -try_to_munlock() calls the same functions as try_to_unmap() for anonymous and -mapped file and KSM pages with a flag argument specifying unlock versus unmap -processing. Again, these functions walk the respective reverse maps looking -for VM_LOCKED VMAs. When such a VMA is found, as in the try_to_unmap() case, -the functions mlock the page via mlock_vma_page() and return SWAP_MLOCK. This -undoes the pre-clearing of the page's PG_mlocked done by munlock_vma_page. - -Note that try_to_munlock()'s reverse map walk must visit every VMA in a page's -reverse map to determine that a page is NOT mapped into any VM_LOCKED VMA. -However, the scan can terminate when it encounters a VM_LOCKED VMA. -Although try_to_munlock() might be called a great many times when munlocking a -large region or tearing down a large address space that has been mlocked via -mlockall(), overall this is a fairly rare event. - - -Page Reclaim in shrink_*_list() -------------------------------- - -shrink_active_list() culls any obviously unevictable pages - i.e. -!page_evictable(page) - diverting these to the unevictable list. -However, shrink_active_list() only sees unevictable pages that made it onto the -active/inactive lru lists. Note that these pages do not have PageUnevictable -set - otherwise they would be on the unevictable list and shrink_active_list -would never see them. - -Some examples of these unevictable pages on the LRU lists are: - - (1) ramfs pages that have been placed on the LRU lists when first allocated. - - (2) SHM_LOCK'd shared memory pages. shmctl(SHM_LOCK) does not attempt to - allocate or fault in the pages in the shared memory region. This happens - when an application accesses the page the first time after SHM_LOCK'ing - the segment. - - (3) mlocked pages that could not be isolated from the LRU and moved to the - unevictable list in mlock_vma_page(). - -shrink_inactive_list() also diverts any unevictable pages that it finds on the -inactive lists to the appropriate zone's unevictable list. 
-
-shrink_inactive_list() should only see SHM_LOCK'd pages that became SHM_LOCK'd
-after shrink_active_list() had moved them to the inactive list, or pages mapped
-into VM_LOCKED VMAs that munlock_vma_page() couldn't isolate from the LRU to
-recheck via try_to_munlock(). shrink_inactive_list() won't notice the latter,
-but will pass on to shrink_page_list().
-
-shrink_page_list() again culls obviously unevictable pages that it could
-encounter for similar reason to shrink_inactive_list(). Pages mapped into
-VM_LOCKED VMAs but without PG_mlocked set will make it all the way to
-try_to_unmap(). shrink_page_list() will divert them to the unevictable list
-when try_to_unmap() returns SWAP_MLOCK, as discussed above.
diff --git a/Documentation/vm/userfaultfd.rst b/Documentation/vm/userfaultfd.rst
new file mode 100644
index 000000000000..5048cf661a8a
--- /dev/null
+++ b/Documentation/vm/userfaultfd.rst
@@ -0,0 +1,241 @@
+.. _userfaultfd:
+
+===========
+Userfaultfd
+===========
+
+Objective
+=========
+
+Userfaults allow the implementation of on-demand paging from userland
+and, more generally, they allow userland to take control of various
+memory page faults, something otherwise only the kernel code could do.
+
+For example, userfaults allow a proper and more optimal implementation
+of the PROT_NONE+SIGSEGV trick.
+
+Design
+======
+
+Userfaults are delivered and resolved through the userfaultfd syscall.
+
+The userfaultfd (aside from registering and unregistering virtual
+memory ranges) provides two primary functionalities:
+
+1) a read/POLLIN protocol to notify a userland thread of the faults
+   happening
+
+2) various UFFDIO_* ioctls that can manage the virtual memory regions
+   registered in the userfaultfd, allowing userland to efficiently
+   resolve the userfaults it receives via 1) or to manage the virtual
+   memory in the background
+
+The real advantage of userfaults, when compared to regular virtual memory
+management via mremap/mprotect, is that the userfaults in all their
+operations never involve heavyweight structures like vmas (in fact the
+userfaultfd runtime load never takes the mmap_sem for writing).
+
+Vmas are not suitable for page- (or hugepage-) granular fault tracking
+when dealing with virtual address spaces that could span
+terabytes. Too many vmas would be needed for that.
+
+The userfaultfd, once opened by invoking the syscall, can also be
+passed using unix domain sockets to a manager process, so the same
+manager process could handle the userfaults of a multitude of
+different processes without them being aware of what is going on
+(well, of course, unless they later try to use the userfaultfd
+themselves on the same region the manager is already tracking, which
+is a corner case that would currently return -EBUSY).
+
+API
+===
+
+When first opened, the userfaultfd must be enabled by invoking the
+UFFDIO_API ioctl specifying a uffdio_api.api value set to UFFD_API (or
+a later API version), which will specify the read/POLLIN protocol
+userland intends to speak on the UFFD and the uffdio_api.features
+userland requires. The UFFDIO_API ioctl, if successful (i.e. if the
+requested uffdio_api.api is spoken also by the running kernel and the
+requested features are going to be enabled), will return into
+uffdio_api.features and uffdio_api.ioctls two 64bit bitmasks of,
+respectively, all the available features of the read(2) protocol and
+the generic ioctls available.
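+
+A minimal open-and-handshake sketch (error handling trimmed; ``uffd_open``
+is an illustrative helper name, not part of the kernel API)::
+
+    #include <fcntl.h>
+    #include <sys/ioctl.h>
+    #include <sys/syscall.h>
+    #include <unistd.h>
+    #include <linux/userfaultfd.h>
+
+    static int uffd_open(void)
+    {
+        struct uffdio_api api = { .api = UFFD_API, .features = 0 };
+        int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+
+        if (uffd < 0)
+            return -1;
+        /* Negotiate the API; on success the kernel fills in the
+         * supported features and generic ioctls. */
+        if (ioctl(uffd, UFFDIO_API, &api)) {
+            close(uffd);
+            return -1;
+        }
+        return uffd;
+    }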
+
+The uffdio_api.features bitmask returned by the UFFDIO_API ioctl
+defines what memory types are supported by the userfaultfd and what
+events, other than page fault notifications, may be generated.
+
+If the kernel supports registering userfaultfd ranges on hugetlbfs
+virtual memory areas, UFFD_FEATURE_MISSING_HUGETLBFS will be set in
+uffdio_api.features. Similarly, UFFD_FEATURE_MISSING_SHMEM will be
+set if the kernel supports registering userfaultfd ranges on shared
+memory (covering all shmem APIs, i.e. tmpfs, IPC SHM, /dev/zero with
+MAP_SHARED, memfd_create, etc).
+
+A userland application that wants to use userfaultfd with hugetlbfs
+or shared memory needs to set the corresponding flag in
+uffdio_api.features to enable those features.
+
+If userland desires to receive notifications for events other than
+page faults, it has to verify that uffdio_api.features has the appropriate
+UFFD_FEATURE_EVENT_* bits set. These events are described in more
+detail below in the "Non-cooperative userfaultfd" section.
+
+Once the userfaultfd has been enabled, the UFFDIO_REGISTER ioctl should
+be invoked (if present in the returned uffdio_api.ioctls bitmask) to
+register a memory range in the userfaultfd by setting the
+uffdio_register structure accordingly. The uffdio_register.mode
+bitmask specifies to the kernel which kind of faults to track for
+the range (UFFDIO_REGISTER_MODE_MISSING would track missing
+pages). The UFFDIO_REGISTER ioctl will return the
+uffdio_register.ioctls bitmask of ioctls that are suitable to resolve
+userfaults on the range registered. Not all ioctls will necessarily be
+supported for all memory types, depending on the underlying virtual
+memory backend (anonymous memory vs tmpfs vs real file-backed
+mappings).
+
+Userland can use the uffdio_register.ioctls to manage the virtual
+address space in the background (to add or potentially also remove
+memory from the userfaultfd registered range). This means a userfault
+could be triggering just before userland maps in the background the
+user-faulted page.
+
+The primary ioctl to resolve userfaults is UFFDIO_COPY. It
+atomically copies a page into the userfault registered range and wakes
+up the blocked userfaults (unless uffdio_copy.mode &
+UFFDIO_COPY_MODE_DONTWAKE is set). The other ioctls work similarly to
+UFFDIO_COPY. They're atomic in the sense that nothing can see a
+half-copied page, since the fault will keep userfaulting until the copy
+has finished.
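+
+Putting the register/read/copy pieces together (a sketch; ``area``/``len``
+describe an already-mmap()ed region, ``page`` is an illustrative page-sized,
+page-aligned source buffer, and error handling is trimmed)::
+
+    #include <poll.h>
+    #include <sys/ioctl.h>
+    #include <unistd.h>
+    #include <linux/userfaultfd.h>
+
+    static void resolve_one_fault(int uffd, void *area, size_t len,
+                                  void *page, size_t page_size)
+    {
+        struct uffdio_register reg = {
+            .range = { .start = (unsigned long)area, .len = len },
+            .mode  = UFFDIO_REGISTER_MODE_MISSING,
+        };
+        struct pollfd pfd = { .fd = uffd, .events = POLLIN };
+        struct uffdio_copy copy;
+        struct uffd_msg msg;
+
+        ioctl(uffd, UFFDIO_REGISTER, &reg);
+
+        /* Block until a fault is reported, then read its address. */
+        poll(&pfd, 1, -1);
+        read(uffd, &msg, sizeof(msg));
+        if (msg.event != UFFD_EVENT_PAGEFAULT)
+            return;
+
+        /* Resolve it: copy a page into place; this also wakes the
+         * blocked faulting thread. */
+        copy.dst  = msg.arg.pagefault.address &
+                    ~(unsigned long long)(page_size - 1);
+        copy.src  = (unsigned long)page;
+        copy.len  = page_size;
+        copy.mode = 0;
+        copy.copy = 0;
+        ioctl(uffd, UFFDIO_COPY, &copy);
+    }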
+
+QEMU/KVM
+========
+
+QEMU/KVM uses the userfaultfd syscall to implement postcopy live
+migration. Postcopy live migration is one form of memory
+externalization, consisting of a virtual machine running with part or
+all of its memory residing on a different node in the cloud. The
+userfaultfd abstraction is generic enough that not a single line of
+KVM kernel code had to be modified in order to add postcopy live
+migration to QEMU.
+
+Guest async page faults, FOLL_NOWAIT and all other GUP features work
+just fine in combination with userfaults. Userfaults trigger async
+page faults in the guest scheduler, so guest processes that
+aren't waiting for userfaults (i.e. network bound) can keep running in
+the guest vcpus.
+
+It is generally beneficial to run one pass of precopy live migration
+just before starting postcopy live migration, in order to avoid
+generating userfaults for readonly guest regions.
+
+The implementation of postcopy live migration currently uses one
+single bidirectional socket, but in the future two different sockets
+will be used (to reduce the latency of the userfaults to the minimum
+possible without having to decrease /proc/sys/net/ipv4/tcp_wmem).
+
+The QEMU in the source node writes all pages that it knows are missing
+in the destination node into the socket, and the migration thread of
+the QEMU running in the destination node runs UFFDIO_COPY|ZEROPAGE
+ioctls on the userfaultfd in order to map the received pages into the
+guest (UFFDIO_ZEROPAGE is used if the source page was a zero page).
+
+A different postcopy thread in the destination node listens with
+poll() to the userfaultfd in parallel. When a POLLIN event is
+generated after a userfault triggers, the postcopy thread read()s from
+the userfaultfd and receives the fault address (or -EAGAIN in case the
+userfault was already resolved and woken by a UFFDIO_COPY|ZEROPAGE run
+by the parallel QEMU migration thread).
+
+After the QEMU postcopy thread (running in the destination node) gets
+the userfault address, it writes the information about the missing page
+into the socket. The QEMU source node receives the information and
+roughly "seeks" to that page address and continues sending all
+remaining missing pages from that new page offset. Soon after that
+(just the time to flush the tcp_wmem queue through the network) the
+migration thread in the QEMU running in the destination node will
+receive the page that triggered the userfault and it'll map it as
+usual with UFFDIO_COPY|ZEROPAGE (without actually knowing if it
+was spontaneously sent by the source or if it was an urgent page
+requested through a userfault).
+
+By the time the userfaults start, the QEMU in the destination node
+doesn't need to keep any per-page state bitmap relative to the live
+migration around, and a single per-page bitmap has to be maintained in
+the QEMU running in the source node to know which pages are still
+missing in the destination node. The bitmap in the source node is
+checked to find which missing pages to send in round robin, and we seek
+over it when receiving incoming userfaults. After sending each page, of
+course, the bitmap is updated accordingly. It's also useful to avoid
+sending the same page twice (in case the userfault is read by the
+postcopy thread just before UFFDIO_COPY|ZEROPAGE runs in the migration
+thread).
+
+Non-cooperative userfaultfd
+===========================
+
+When the userfaultfd is monitored by an external manager, the manager
+must be able to track changes in the process virtual memory
+layout. Userfaultfd can notify the manager about such changes using
+the same read(2) protocol as for the page fault notifications. The
+manager has to explicitly enable these events by setting the appropriate
+bits in the uffdio_api.features passed to the UFFDIO_API ioctl:
+
+UFFD_FEATURE_EVENT_FORK
+    enable userfaultfd hooks for fork(). When this feature is
+    enabled, the userfaultfd context of the parent process is
+    duplicated into the newly created process. The manager
+    receives UFFD_EVENT_FORK with the file descriptor of the new
+    userfaultfd context in uffd_msg.fork.
+
+UFFD_FEATURE_EVENT_REMAP
+    enable notifications about mremap() calls. When the
+    non-cooperative process moves a virtual memory area to a
+    different location, the manager will receive
+    UFFD_EVENT_REMAP. The uffd_msg.remap will contain the old and
+    new addresses of the area and its original length.
+
+UFFD_FEATURE_EVENT_REMOVE
+    enable notifications about madvise(MADV_REMOVE) and
+    madvise(MADV_DONTNEED) calls. The event UFFD_EVENT_REMOVE will
+    be generated upon these calls to madvise(). The uffd_msg.remove
+    will contain the start and end addresses of the removed area.
+
+UFFD_FEATURE_EVENT_UNMAP
+    enable notifications about memory unmapping. The manager will
+    get UFFD_EVENT_UNMAP with uffd_msg.remove containing the start
+    and end addresses of the unmapped area.
+
+Although UFFD_FEATURE_EVENT_REMOVE and UFFD_FEATURE_EVENT_UNMAP
+are pretty similar, they differ in the action expected from the
+userfaultfd manager. In the former case, the virtual memory is
+removed, but the area is not: the area remains monitored by the
+userfaultfd, and if a page fault occurs in that area it will be
+delivered to the manager. The proper resolution for such a page fault is
+to zeromap the faulting address. In the latter case, however, when an
+area is unmapped, either explicitly (with the munmap() system call) or
+implicitly (e.g. during mremap()), the area is removed, and in turn the
+userfaultfd context for that area disappears too; the manager will
+not get further userland page faults from the removed area. Still, the
+notification is required in order to prevent the manager from using
+UFFDIO_COPY on the unmapped area.
+
+Unlike userland page faults, which have to be synchronous and require
+explicit or implicit wakeup, all the events are delivered
+asynchronously and the non-cooperative process resumes execution as
+soon as the manager executes read(). The userfaultfd manager should
+carefully synchronize calls to UFFDIO_COPY with the event
+processing. To aid the synchronization, the UFFDIO_COPY ioctl will
+return -ENOSPC when the monitored process exits at the time of
+UFFDIO_COPY, and -ENOENT when the non-cooperative process has changed
+its virtual memory layout simultaneously with an outstanding UFFDIO_COPY
+operation.
+
+The current asynchronous model of event delivery is optimal for
+single-threaded non-cooperative userfaultfd manager implementations. A
+synchronous event delivery model can be added later as a new
+userfaultfd feature to facilitate multithreading enhancements of the
+non-cooperative manager, for example to allow UFFDIO_COPY ioctls to
+run in parallel to the event reception. Single-threaded
+implementations should continue to use the current async event
+delivery model instead.
diff --git a/Documentation/vm/userfaultfd.txt b/Documentation/vm/userfaultfd.txt
deleted file mode 100644
index 5048cf661a8a..000000000000
--- a/Documentation/vm/userfaultfd.txt
+++ /dev/null
@@ -1,241 +0,0 @@
-.. _userfaultfd:
-
-===========
-Userfaultfd
-===========
-
-Objective
-=========
-
-Userfaults allow the implementation of on-demand paging from userland
-and more generally they allow userland to take control of various
-memory page faults, something otherwise only the kernel code could do.
-
-For example userfaults allows a proper and more optimal implementation
-of the PROT_NONE+SIGSEGV trick.
-
-Design
-======
-
-Userfaults are delivered and resolved through the userfaultfd syscall.
- -The userfaultfd (aside from registering and unregistering virtual -memory ranges) provides two primary functionalities: - -1) read/POLLIN protocol to notify a userland thread of the faults - happening - -2) various UFFDIO_* ioctls that can manage the virtual memory regions - registered in the userfaultfd that allows userland to efficiently - resolve the userfaults it receives via 1) or to manage the virtual - memory in the background - -The real advantage of userfaults if compared to regular virtual memory -management of mremap/mprotect is that the userfaults in all their -operations never involve heavyweight structures like vmas (in fact the -userfaultfd runtime load never takes the mmap_sem for writing). - -Vmas are not suitable for page- (or hugepage) granular fault tracking -when dealing with virtual address spaces that could span -Terabytes. Too many vmas would be needed for that. - -The userfaultfd once opened by invoking the syscall, can also be -passed using unix domain sockets to a manager process, so the same -manager process could handle the userfaults of a multitude of -different processes without them being aware about what is going on -(well of course unless they later try to use the userfaultfd -themselves on the same region the manager is already tracking, which -is a corner case that would currently return -EBUSY). - -API -=== - -When first opened the userfaultfd must be enabled invoking the -UFFDIO_API ioctl specifying a uffdio_api.api value set to UFFD_API (or -a later API version) which will specify the read/POLLIN protocol -userland intends to speak on the UFFD and the uffdio_api.features -userland requires. The UFFDIO_API ioctl if successful (i.e. if the -requested uffdio_api.api is spoken also by the running kernel and the -requested features are going to be enabled) will return into -uffdio_api.features and uffdio_api.ioctls two 64bit bitmasks of -respectively all the available features of the read(2) protocol and -the generic ioctl available. - -The uffdio_api.features bitmask returned by the UFFDIO_API ioctl -defines what memory types are supported by the userfaultfd and what -events, except page fault notifications, may be generated. - -If the kernel supports registering userfaultfd ranges on hugetlbfs -virtual memory areas, UFFD_FEATURE_MISSING_HUGETLBFS will be set in -uffdio_api.features. Similarly, UFFD_FEATURE_MISSING_SHMEM will be -set if the kernel supports registering userfaultfd ranges on shared -memory (covering all shmem APIs, i.e. tmpfs, IPCSHM, /dev/zero -MAP_SHARED, memfd_create, etc). - -The userland application that wants to use userfaultfd with hugetlbfs -or shared memory need to set the corresponding flag in -uffdio_api.features to enable those features. - -If the userland desires to receive notifications for events other than -page faults, it has to verify that uffdio_api.features has appropriate -UFFD_FEATURE_EVENT_* bits set. These events are described in more -detail below in "Non-cooperative userfaultfd" section. - -Once the userfaultfd has been enabled the UFFDIO_REGISTER ioctl should -be invoked (if present in the returned uffdio_api.ioctls bitmask) to -register a memory range in the userfaultfd by setting the -uffdio_register structure accordingly. The uffdio_register.mode -bitmask will specify to the kernel which kind of faults to track for -the range (UFFDIO_REGISTER_MODE_MISSING would track missing -pages). 
The UFFDIO_REGISTER ioctl will return the -uffdio_register.ioctls bitmask of ioctls that are suitable to resolve -userfaults on the range registered. Not all ioctls will necessarily be -supported for all memory types depending on the underlying virtual -memory backend (anonymous memory vs tmpfs vs real filebacked -mappings). - -Userland can use the uffdio_register.ioctls to manage the virtual -address space in the background (to add or potentially also remove -memory from the userfaultfd registered range). This means a userfault -could be triggering just before userland maps in the background the -user-faulted page. - -The primary ioctl to resolve userfaults is UFFDIO_COPY. That -atomically copies a page into the userfault registered range and wakes -up the blocked userfaults (unless uffdio_copy.mode & -UFFDIO_COPY_MODE_DONTWAKE is set). Other ioctl works similarly to -UFFDIO_COPY. They're atomic as in guaranteeing that nothing can see an -half copied page since it'll keep userfaulting until the copy has -finished. - -QEMU/KVM -======== - -QEMU/KVM is using the userfaultfd syscall to implement postcopy live -migration. Postcopy live migration is one form of memory -externalization consisting of a virtual machine running with part or -all of its memory residing on a different node in the cloud. The -userfaultfd abstraction is generic enough that not a single line of -KVM kernel code had to be modified in order to add postcopy live -migration to QEMU. - -Guest async page faults, FOLL_NOWAIT and all other GUP features work -just fine in combination with userfaults. Userfaults trigger async -page faults in the guest scheduler so those guest processes that -aren't waiting for userfaults (i.e. network bound) can keep running in -the guest vcpus. - -It is generally beneficial to run one pass of precopy live migration -just before starting postcopy live migration, in order to avoid -generating userfaults for readonly guest regions. - -The implementation of postcopy live migration currently uses one -single bidirectional socket but in the future two different sockets -will be used (to reduce the latency of the userfaults to the minimum -possible without having to decrease /proc/sys/net/ipv4/tcp_wmem). - -The QEMU in the source node writes all pages that it knows are missing -in the destination node, into the socket, and the migration thread of -the QEMU running in the destination node runs UFFDIO_COPY|ZEROPAGE -ioctls on the userfaultfd in order to map the received pages into the -guest (UFFDIO_ZEROCOPY is used if the source page was a zero page). - -A different postcopy thread in the destination node listens with -poll() to the userfaultfd in parallel. When a POLLIN event is -generated after a userfault triggers, the postcopy thread read() from -the userfaultfd and receives the fault address (or -EAGAIN in case the -userfault was already resolved and waken by a UFFDIO_COPY|ZEROPAGE run -by the parallel QEMU migration thread). - -After the QEMU postcopy thread (running in the destination node) gets -the userfault address it writes the information about the missing page -into the socket. The QEMU source node receives the information and -roughly "seeks" to that page address and continues sending all -remaining missing pages from that new page offset. 
Soon after that -(just the time to flush the tcp_wmem queue through the network) the -migration thread in the QEMU running in the destination node will -receive the page that triggered the userfault and it'll map it as -usual with the UFFDIO_COPY|ZEROPAGE (without actually knowing if it -was spontaneously sent by the source or if it was an urgent page -requested through a userfault). - -By the time the userfaults start, the QEMU in the destination node -doesn't need to keep any per-page state bitmap relative to the live -migration around and a single per-page bitmap has to be maintained in -the QEMU running in the source node to know which pages are still -missing in the destination node. The bitmap in the source node is -checked to find which missing pages to send in round robin and we seek -over it when receiving incoming userfaults. After sending each page of -course the bitmap is updated accordingly. It's also useful to avoid -sending the same page twice (in case the userfault is read by the -postcopy thread just before UFFDIO_COPY|ZEROPAGE runs in the migration -thread). - -Non-cooperative userfaultfd -=========================== - -When the userfaultfd is monitored by an external manager, the manager -must be able to track changes in the process virtual memory -layout. Userfaultfd can notify the manager about such changes using -the same read(2) protocol as for the page fault notifications. The -manager has to explicitly enable these events by setting appropriate -bits in uffdio_api.features passed to UFFDIO_API ioctl: - -UFFD_FEATURE_EVENT_FORK - enable userfaultfd hooks for fork(). When this feature is - enabled, the userfaultfd context of the parent process is - duplicated into the newly created process. The manager - receives UFFD_EVENT_FORK with file descriptor of the new - userfaultfd context in the uffd_msg.fork. - -UFFD_FEATURE_EVENT_REMAP - enable notifications about mremap() calls. When the - non-cooperative process moves a virtual memory area to a - different location, the manager will receive - UFFD_EVENT_REMAP. The uffd_msg.remap will contain the old and - new addresses of the area and its original length. - -UFFD_FEATURE_EVENT_REMOVE - enable notifications about madvise(MADV_REMOVE) and - madvise(MADV_DONTNEED) calls. The event UFFD_EVENT_REMOVE will - be generated upon these calls to madvise. The uffd_msg.remove - will contain start and end addresses of the removed area. - -UFFD_FEATURE_EVENT_UNMAP - enable notifications about memory unmapping. The manager will - get UFFD_EVENT_UNMAP with uffd_msg.remove containing start and - end addresses of the unmapped area. - -Although the UFFD_FEATURE_EVENT_REMOVE and UFFD_FEATURE_EVENT_UNMAP -are pretty similar, they quite differ in the action expected from the -userfaultfd manager. In the former case, the virtual memory is -removed, but the area is not, the area remains monitored by the -userfaultfd, and if a page fault occurs in that area it will be -delivered to the manager. The proper resolution for such page fault is -to zeromap the faulting address. However, in the latter case, when an -area is unmapped, either explicitly (with munmap() system call), or -implicitly (e.g. during mremap()), the area is removed and in turn the -userfaultfd context for such area disappears too and the manager will -not get further userland page faults from the removed area. Still, the -notification is required in order to prevent manager from using -UFFDIO_COPY on the unmapped area. 
-
-Unlike userland page faults, which have to be synchronous and require
-an explicit or implicit wakeup, all the events are delivered
-asynchronously and the non-cooperative process resumes execution as
-soon as the manager executes read(). The userfaultfd manager should
-carefully synchronize calls to UFFDIO_COPY with the event
-processing. To aid the synchronization, the UFFDIO_COPY ioctl will
-return -ENOSPC when the monitored process exits at the time of
-UFFDIO_COPY, and -ENOENT when the non-cooperative process has changed
-its virtual memory layout simultaneously with an outstanding
-UFFDIO_COPY operation.
-
-The current asynchronous model of event delivery is optimal for
-single-threaded non-cooperative userfaultfd manager implementations. A
-synchronous event delivery model can be added later as a new
-userfaultfd feature, to facilitate multithreading enhancements of the
-non-cooperative manager, for example to allow UFFDIO_COPY ioctls to
-run in parallel with the event reception. Single-threaded
-implementations should continue to use the current async event
-delivery model instead.
diff --git a/Documentation/vm/z3fold.rst b/Documentation/vm/z3fold.rst
new file mode 100644
index 000000000000..224e3c61d686
--- /dev/null
+++ b/Documentation/vm/z3fold.rst
@@ -0,0 +1,30 @@
+.. _z3fold:
+
+======
+z3fold
+======
+
+z3fold is a special purpose allocator for storing compressed pages.
+It is designed to store up to three compressed pages per physical page.
+It is a zbud derivative which allows for higher compression
+ratio keeping the simplicity and determinism of its predecessor.
+
+The main differences between z3fold and zbud are:
+
+* unlike zbud, z3fold allows for up to PAGE_SIZE allocations
+* z3fold can hold up to 3 compressed pages in its page
+* z3fold doesn't export any API itself and is thus intended to be used
+  via the zpool API.
+
+To keep the determinism and simplicity, z3fold, just like zbud, always
+stores an integral number of compressed pages per page, but it can store
+up to 3 pages unlike zbud which can store at most 2. Therefore the
+compression ratio goes to around 2.7x while zbud's one is around 1.7x.
+
+Unlike zbud (but like zsmalloc for that matter) z3fold_alloc() does not
+return a dereferenceable pointer. Instead, it returns an unsigned long
+handle which encodes actual location of the allocated object.
+
+Keeping effective compression ratio close to zsmalloc's, z3fold doesn't
+depend on MMU enabled and provides more predictable reclaim behavior
+which makes it a better fit for small and response-critical systems.
diff --git a/Documentation/vm/z3fold.txt b/Documentation/vm/z3fold.txt
deleted file mode 100644
index 224e3c61d686..000000000000
--- a/Documentation/vm/z3fold.txt
+++ /dev/null
@@ -1,30 +0,0 @@
-.. _z3fold:
-
-======
-z3fold
-======
-
-z3fold is a special purpose allocator for storing compressed pages.
-It is designed to store up to three compressed pages per physical page.
-It is a zbud derivative which allows for higher compression
-ratio keeping the simplicity and determinism of its predecessor.
-
-The main differences between z3fold and zbud are:
-
-* unlike zbud, z3fold allows for up to PAGE_SIZE allocations
-* z3fold can hold up to 3 compressed pages in its page
-* z3fold doesn't export any API itself and is thus intended to be used
-  via the zpool API.
-
-To keep the determinism and simplicity, z3fold, just like zbud, always
-stores an integral number of compressed pages per page, but it can store
-up to 3 pages unlike zbud which can store at most 2.
Therefore the -compression ratio goes to around 2.7x while zbud's one is around 1.7x. - -Unlike zbud (but like zsmalloc for that matter) z3fold_alloc() does not -return a dereferenceable pointer. Instead, it returns an unsigned long -handle which encodes actual location of the allocated object. - -Keeping effective compression ratio close to zsmalloc's, z3fold doesn't -depend on MMU enabled and provides more predictable reclaim behavior -which makes it a better fit for small and response-critical systems. diff --git a/Documentation/vm/zsmalloc.rst b/Documentation/vm/zsmalloc.rst new file mode 100644 index 000000000000..6e79893d6132 --- /dev/null +++ b/Documentation/vm/zsmalloc.rst @@ -0,0 +1,82 @@ +.. _zsmalloc: + +======== +zsmalloc +======== + +This allocator is designed for use with zram. Thus, the allocator is +supposed to work well under low memory conditions. In particular, it +never attempts higher order page allocation which is very likely to +fail under memory pressure. On the other hand, if we just use single +(0-order) pages, it would suffer from very high fragmentation -- +any object of size PAGE_SIZE/2 or larger would occupy an entire page. +This was one of the major issues with its predecessor (xvmalloc). + +To overcome these issues, zsmalloc allocates a bunch of 0-order pages +and links them together using various 'struct page' fields. These linked +pages act as a single higher-order page i.e. an object can span 0-order +page boundaries. The code refers to these linked pages as a single entity +called zspage. + +For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE +since this satisfies the requirements of all its current users (in the +worst case, page is incompressible and is thus stored "as-is" i.e. in +uncompressed form). For allocation requests larger than this size, failure +is returned (see zs_malloc). + +Additionally, zs_malloc() does not return a dereferenceable pointer. +Instead, it returns an opaque handle (unsigned long) which encodes actual +location of the allocated object. The reason for this indirection is that +zsmalloc does not keep zspages permanently mapped since that would cause +issues on 32-bit systems where the VA region for kernel space mappings +is very small. So, before using the allocating memory, the object has to +be mapped using zs_map_object() to get a usable pointer and subsequently +unmapped using zs_unmap_object(). + +stat +==== + +With CONFIG_ZSMALLOC_STAT, we could see zsmalloc internal information via +``/sys/kernel/debug/zsmalloc/``. Here is a sample of stat output:: + + # cat /sys/kernel/debug/zsmalloc/zram0/classes + + class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage + ... + ... + 9 176 0 1 186 129 8 4 + 10 192 1 0 2880 2872 135 3 + 11 208 0 1 819 795 42 2 + 12 224 0 1 219 159 12 4 + ... + ... 
+ + +class + index +size + object size zspage stores +almost_empty + the number of ZS_ALMOST_EMPTY zspages(see below) +almost_full + the number of ZS_ALMOST_FULL zspages(see below) +obj_allocated + the number of objects allocated +obj_used + the number of objects allocated to the user +pages_used + the number of pages allocated for the class +pages_per_zspage + the number of 0-order pages to make a zspage + +We assign a zspage to ZS_ALMOST_EMPTY fullness group when n <= N / f, where + +* n = number of allocated objects +* N = total number of objects zspage can store +* f = fullness_threshold_frac(ie, 4 at the moment) + +Similarly, we assign zspage to: + +* ZS_ALMOST_FULL when n > N / f +* ZS_EMPTY when n == 0 +* ZS_FULL when n == N diff --git a/Documentation/vm/zsmalloc.txt b/Documentation/vm/zsmalloc.txt deleted file mode 100644 index 6e79893d6132..000000000000 --- a/Documentation/vm/zsmalloc.txt +++ /dev/null @@ -1,82 +0,0 @@ -.. _zsmalloc: - -======== -zsmalloc -======== - -This allocator is designed for use with zram. Thus, the allocator is -supposed to work well under low memory conditions. In particular, it -never attempts higher order page allocation which is very likely to -fail under memory pressure. On the other hand, if we just use single -(0-order) pages, it would suffer from very high fragmentation -- -any object of size PAGE_SIZE/2 or larger would occupy an entire page. -This was one of the major issues with its predecessor (xvmalloc). - -To overcome these issues, zsmalloc allocates a bunch of 0-order pages -and links them together using various 'struct page' fields. These linked -pages act as a single higher-order page i.e. an object can span 0-order -page boundaries. The code refers to these linked pages as a single entity -called zspage. - -For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE -since this satisfies the requirements of all its current users (in the -worst case, page is incompressible and is thus stored "as-is" i.e. in -uncompressed form). For allocation requests larger than this size, failure -is returned (see zs_malloc). - -Additionally, zs_malloc() does not return a dereferenceable pointer. -Instead, it returns an opaque handle (unsigned long) which encodes actual -location of the allocated object. The reason for this indirection is that -zsmalloc does not keep zspages permanently mapped since that would cause -issues on 32-bit systems where the VA region for kernel space mappings -is very small. So, before using the allocating memory, the object has to -be mapped using zs_map_object() to get a usable pointer and subsequently -unmapped using zs_unmap_object(). - -stat -==== - -With CONFIG_ZSMALLOC_STAT, we could see zsmalloc internal information via -``/sys/kernel/debug/zsmalloc/``. Here is a sample of stat output:: - - # cat /sys/kernel/debug/zsmalloc/zram0/classes - - class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage - ... - ... - 9 176 0 1 186 129 8 4 - 10 192 1 0 2880 2872 135 3 - 11 208 0 1 819 795 42 2 - 12 224 0 1 219 159 12 4 - ... - ... 
- - -class - index -size - object size zspage stores -almost_empty - the number of ZS_ALMOST_EMPTY zspages(see below) -almost_full - the number of ZS_ALMOST_FULL zspages(see below) -obj_allocated - the number of objects allocated -obj_used - the number of objects allocated to the user -pages_used - the number of pages allocated for the class -pages_per_zspage - the number of 0-order pages to make a zspage - -We assign a zspage to ZS_ALMOST_EMPTY fullness group when n <= N / f, where - -* n = number of allocated objects -* N = total number of objects zspage can store -* f = fullness_threshold_frac(ie, 4 at the moment) - -Similarly, we assign zspage to: - -* ZS_ALMOST_FULL when n > N / f -* ZS_EMPTY when n == 0 -* ZS_FULL when n == N diff --git a/Documentation/vm/zswap.rst b/Documentation/vm/zswap.rst new file mode 100644 index 000000000000..1444ecd40911 --- /dev/null +++ b/Documentation/vm/zswap.rst @@ -0,0 +1,135 @@ +.. _zswap: + +===== +zswap +===== + +Overview +======== + +Zswap is a lightweight compressed cache for swap pages. It takes pages that are +in the process of being swapped out and attempts to compress them into a +dynamically allocated RAM-based memory pool. zswap basically trades CPU cycles +for potentially reduced swap I/O.  This trade-off can also result in a +significant performance improvement if reads from the compressed cache are +faster than reads from a swap device. + +.. note:: + Zswap is a new feature as of v3.11 and interacts heavily with memory + reclaim. This interaction has not been fully explored on the large set of + potential configurations and workloads that exist. For this reason, zswap + is a work in progress and should be considered experimental. + + Some potential benefits: + +* Desktop/laptop users with limited RAM capacities can mitigate the + performance impact of swapping. +* Overcommitted guests that share a common I/O resource can + dramatically reduce their swap I/O pressure, avoiding heavy handed I/O + throttling by the hypervisor. This allows more work to get done with less + impact to the guest workload and guests sharing the I/O subsystem +* Users with SSDs as swap devices can extend the life of the device by + drastically reducing life-shortening writes. + +Zswap evicts pages from compressed cache on an LRU basis to the backing swap +device when the compressed pool reaches its size limit. This requirement had +been identified in prior community discussions. + +Zswap is disabled by default but can be enabled at boot time by setting +the ``enabled`` attribute to 1 at boot time. ie: ``zswap.enabled=1``. Zswap +can also be enabled and disabled at runtime using the sysfs interface. +An example command to enable zswap at runtime, assuming sysfs is mounted +at ``/sys``, is:: + + echo 1 > /sys/module/zswap/parameters/enabled + +When zswap is disabled at runtime it will stop storing pages that are +being swapped out. However, it will _not_ immediately write out or fault +back into memory all of the pages stored in the compressed pool. The +pages stored in zswap will remain in the compressed pool until they are +either invalidated or faulted back into memory. In order to force all +pages out of the compressed pool, a swapoff on the swap device(s) will +fault back into memory all swapped out pages, including those in the +compressed pool. 
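The Design section below centres on zpool's handle indirection, which
can be sketched in kernel C. This is a hedged sketch under assumptions,
not actual zswap code: store_compressed() is a made-up helper, and the
eviction and error handling a real store path needs are omitted::

	#include <linux/gfp.h>
	#include <linux/string.h>
	#include <linux/zpool.h>

	/* Store 'len' compressed bytes behind an opaque zpool handle. */
	static int store_compressed(struct zpool *pool, const void *buf,
				    size_t len, unsigned long *handle)
	{
		void *dst;
		int ret;

		ret = zpool_malloc(pool, len, GFP_KERNEL, handle);
		if (ret)
			return ret;

		/* Handles are not pointers: map before touching... */
		dst = zpool_map_handle(pool, *handle, ZPOOL_MM_WO);
		memcpy(dst, buf, len);
		/* ...and unmap promptly afterwards. */
		zpool_unmap_handle(pool, *handle);
		return 0;
	}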
+ +Design +====== + +Zswap receives pages for compression through the Frontswap API and is able to +evict pages from its own compressed pool on an LRU basis and write them back to +the backing swap device in the case that the compressed pool is full. + +Zswap makes use of zpool for the managing the compressed memory pool. Each +allocation in zpool is not directly accessible by address. Rather, a handle is +returned by the allocation routine and that handle must be mapped before being +accessed. The compressed memory pool grows on demand and shrinks as compressed +pages are freed. The pool is not preallocated. By default, a zpool +of type zbud is created, but it can be selected at boot time by +setting the ``zpool`` attribute, e.g. ``zswap.zpool=zbud``. It can +also be changed at runtime using the sysfs ``zpool`` attribute, e.g.:: + + echo zbud > /sys/module/zswap/parameters/zpool + +The zbud type zpool allocates exactly 1 page to store 2 compressed pages, which +means the compression ratio will always be 2:1 or worse (because of half-full +zbud pages). The zsmalloc type zpool has a more complex compressed page +storage method, and it can achieve greater storage densities. However, +zsmalloc does not implement compressed page eviction, so once zswap fills it +cannot evict the oldest page, it can only reject new pages. + +When a swap page is passed from frontswap to zswap, zswap maintains a mapping +of the swap entry, a combination of the swap type and swap offset, to the zpool +handle that references that compressed swap page. This mapping is achieved +with a red-black tree per swap type. The swap offset is the search key for the +tree nodes. + +During a page fault on a PTE that is a swap entry, frontswap calls the zswap +load function to decompress the page into the page allocated by the page fault +handler. + +Once there are no PTEs referencing a swap page stored in zswap (i.e. the count +in the swap_map goes to 0) the swap code calls the zswap invalidate function, +via frontswap, to free the compressed entry. + +Zswap seeks to be simple in its policies. Sysfs attributes allow for one user +controlled policy: + +* max_pool_percent - The maximum percentage of memory that the compressed + pool can occupy. + +The default compressor is lzo, but it can be selected at boot time by +setting the ``compressor`` attribute, e.g. ``zswap.compressor=lzo``. +It can also be changed at runtime using the sysfs "compressor" +attribute, e.g.:: + + echo lzo > /sys/module/zswap/parameters/compressor + +When the zpool and/or compressor parameter is changed at runtime, any existing +compressed pages are not modified; they are left in their own zpool. When a +request is made for a page in an old zpool, it is uncompressed using its +original compressor. Once all pages are removed from an old zpool, the zpool +and its compressor are freed. + +Some of the pages in zswap are same-value filled pages (i.e. contents of the +page have same value or repetitive pattern). These pages include zero-filled +pages and they are handled differently. During store operation, a page is +checked if it is a same-value filled page before compressing it. If true, the +compressed length of the page is set to zero and the pattern or same-filled +value is stored. + +Same-value filled pages identification feature is enabled by default and can be +disabled at boot time by setting the ``same_filled_pages_enabled`` attribute +to 0, e.g. ``zswap.same_filled_pages_enabled=0``. 
It can also be enabled and +disabled at runtime using the sysfs ``same_filled_pages_enabled`` +attribute, e.g.:: + + echo 1 > /sys/module/zswap/parameters/same_filled_pages_enabled + +When zswap same-filled page identification is disabled at runtime, it will stop +checking for the same-value filled pages during store operation. However, the +existing pages which are marked as same-value filled pages remain stored +unchanged in zswap until they are either loaded or invalidated. + +A debugfs interface is provided for various statistic about pool size, number +of pages stored, same-value filled pages and various counters for the reasons +pages are rejected. diff --git a/Documentation/vm/zswap.txt b/Documentation/vm/zswap.txt deleted file mode 100644 index 1444ecd40911..000000000000 --- a/Documentation/vm/zswap.txt +++ /dev/null @@ -1,135 +0,0 @@ -.. _zswap: - -===== -zswap -===== - -Overview -======== - -Zswap is a lightweight compressed cache for swap pages. It takes pages that are -in the process of being swapped out and attempts to compress them into a -dynamically allocated RAM-based memory pool. zswap basically trades CPU cycles -for potentially reduced swap I/O.  This trade-off can also result in a -significant performance improvement if reads from the compressed cache are -faster than reads from a swap device. - -.. note:: - Zswap is a new feature as of v3.11 and interacts heavily with memory - reclaim. This interaction has not been fully explored on the large set of - potential configurations and workloads that exist. For this reason, zswap - is a work in progress and should be considered experimental. - - Some potential benefits: - -* Desktop/laptop users with limited RAM capacities can mitigate the - performance impact of swapping. -* Overcommitted guests that share a common I/O resource can - dramatically reduce their swap I/O pressure, avoiding heavy handed I/O - throttling by the hypervisor. This allows more work to get done with less - impact to the guest workload and guests sharing the I/O subsystem -* Users with SSDs as swap devices can extend the life of the device by - drastically reducing life-shortening writes. - -Zswap evicts pages from compressed cache on an LRU basis to the backing swap -device when the compressed pool reaches its size limit. This requirement had -been identified in prior community discussions. - -Zswap is disabled by default but can be enabled at boot time by setting -the ``enabled`` attribute to 1 at boot time. ie: ``zswap.enabled=1``. Zswap -can also be enabled and disabled at runtime using the sysfs interface. -An example command to enable zswap at runtime, assuming sysfs is mounted -at ``/sys``, is:: - - echo 1 > /sys/module/zswap/parameters/enabled - -When zswap is disabled at runtime it will stop storing pages that are -being swapped out. However, it will _not_ immediately write out or fault -back into memory all of the pages stored in the compressed pool. The -pages stored in zswap will remain in the compressed pool until they are -either invalidated or faulted back into memory. In order to force all -pages out of the compressed pool, a swapoff on the swap device(s) will -fault back into memory all swapped out pages, including those in the -compressed pool. - -Design -====== - -Zswap receives pages for compression through the Frontswap API and is able to -evict pages from its own compressed pool on an LRU basis and write them back to -the backing swap device in the case that the compressed pool is full. 
- -Zswap makes use of zpool for the managing the compressed memory pool. Each -allocation in zpool is not directly accessible by address. Rather, a handle is -returned by the allocation routine and that handle must be mapped before being -accessed. The compressed memory pool grows on demand and shrinks as compressed -pages are freed. The pool is not preallocated. By default, a zpool -of type zbud is created, but it can be selected at boot time by -setting the ``zpool`` attribute, e.g. ``zswap.zpool=zbud``. It can -also be changed at runtime using the sysfs ``zpool`` attribute, e.g.:: - - echo zbud > /sys/module/zswap/parameters/zpool - -The zbud type zpool allocates exactly 1 page to store 2 compressed pages, which -means the compression ratio will always be 2:1 or worse (because of half-full -zbud pages). The zsmalloc type zpool has a more complex compressed page -storage method, and it can achieve greater storage densities. However, -zsmalloc does not implement compressed page eviction, so once zswap fills it -cannot evict the oldest page, it can only reject new pages. - -When a swap page is passed from frontswap to zswap, zswap maintains a mapping -of the swap entry, a combination of the swap type and swap offset, to the zpool -handle that references that compressed swap page. This mapping is achieved -with a red-black tree per swap type. The swap offset is the search key for the -tree nodes. - -During a page fault on a PTE that is a swap entry, frontswap calls the zswap -load function to decompress the page into the page allocated by the page fault -handler. - -Once there are no PTEs referencing a swap page stored in zswap (i.e. the count -in the swap_map goes to 0) the swap code calls the zswap invalidate function, -via frontswap, to free the compressed entry. - -Zswap seeks to be simple in its policies. Sysfs attributes allow for one user -controlled policy: - -* max_pool_percent - The maximum percentage of memory that the compressed - pool can occupy. - -The default compressor is lzo, but it can be selected at boot time by -setting the ``compressor`` attribute, e.g. ``zswap.compressor=lzo``. -It can also be changed at runtime using the sysfs "compressor" -attribute, e.g.:: - - echo lzo > /sys/module/zswap/parameters/compressor - -When the zpool and/or compressor parameter is changed at runtime, any existing -compressed pages are not modified; they are left in their own zpool. When a -request is made for a page in an old zpool, it is uncompressed using its -original compressor. Once all pages are removed from an old zpool, the zpool -and its compressor are freed. - -Some of the pages in zswap are same-value filled pages (i.e. contents of the -page have same value or repetitive pattern). These pages include zero-filled -pages and they are handled differently. During store operation, a page is -checked if it is a same-value filled page before compressing it. If true, the -compressed length of the page is set to zero and the pattern or same-filled -value is stored. - -Same-value filled pages identification feature is enabled by default and can be -disabled at boot time by setting the ``same_filled_pages_enabled`` attribute -to 0, e.g. ``zswap.same_filled_pages_enabled=0``. 
It can also be enabled and -disabled at runtime using the sysfs ``same_filled_pages_enabled`` -attribute, e.g.:: - - echo 1 > /sys/module/zswap/parameters/same_filled_pages_enabled - -When zswap same-filled page identification is disabled at runtime, it will stop -checking for the same-value filled pages during store operation. However, the -existing pages which are marked as same-value filled pages remain stored -unchanged in zswap until they are either loaded or invalidated. - -A debugfs interface is provided for various statistic about pool size, number -of pages stored, same-value filled pages and various counters for the reasons -pages are rejected. diff --git a/MAINTAINERS b/MAINTAINERS index 3bdc260e36b7..575849a8343e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15406,7 +15406,7 @@ L: linux-mm@kvack.org S: Maintained F: mm/zsmalloc.c F: include/linux/zsmalloc.h -F: Documentation/vm/zsmalloc.txt +F: Documentation/vm/zsmalloc.rst ZSWAP COMPRESSED SWAP CACHING M: Seth Jennings diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index e96adcbcab41..f53e5060afe7 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -584,7 +584,7 @@ config ARCH_DISCONTIGMEM_ENABLE Say Y to support efficient handling of discontiguous physical memory, for architectures which are either NUMA (Non-Uniform Memory Access) or have huge holes in the physical address space for other reasons. - See for more. + See for more. source "mm/Kconfig" diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index bbe12a038d21..3ac9bf4cc2a0 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -397,7 +397,7 @@ config ARCH_DISCONTIGMEM_ENABLE Say Y to support efficient handling of discontiguous physical memory, for architectures which are either NUMA (Non-Uniform Memory Access) or have huge holes in the physical address space for other reasons. - See for more. + See for more. config ARCH_FLATMEM_ENABLE def_bool y diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 8128c3b68d6b..4562810857eb 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2551,7 +2551,7 @@ config ARCH_DISCONTIGMEM_ENABLE Say Y to support efficient handling of discontiguous physical memory, for architectures which are either NUMA (Non-Uniform Memory Access) or have huge holes in the physical address space for other reasons. - See for more. + See for more. config ARCH_SPARSEMEM_ENABLE bool diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 73ce5dd07642..f8c0f10949ea 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -880,7 +880,7 @@ config PPC_MEM_KEYS page-based protections, but without requiring modification of the page tables when an application changes protection domains. - For details, see Documentation/vm/protection-keys.txt + For details, see Documentation/vm/protection-keys.rst If unsure, say y. diff --git a/fs/Kconfig b/fs/Kconfig index bc821a86d965..ba53dc2a9691 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -196,7 +196,7 @@ config HUGETLBFS help hugetlbfs is a filesystem backing for HugeTLB pages, based on ramfs. For architectures that support it, say Y here and read - for details. + for details. If unsure, say N. diff --git a/fs/dax.c b/fs/dax.c index 0276df90e86c..0eb65c34d5a6 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -618,7 +618,7 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, * downgrading page table protection not changing it to point * to a new page. 
* - * See Documentation/vm/mmu_notifier.txt + * See Documentation/vm/mmu_notifier.rst */ if (pmdp) { #ifdef CONFIG_FS_DAX_PMD diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ec6d2983a5cb..91d14c4ac04a 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -956,7 +956,7 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, /* * The soft-dirty tracker uses #PF-s to catch writes * to pages, so write-protect the pte as well. See the - * Documentation/vm/soft-dirty.txt for full description + * Documentation/vm/soft-dirty.rst for full description * of how soft-dirty works. */ pte_t ptent = *pte; @@ -1436,7 +1436,7 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, * Bits 0-54 page frame number (PFN) if present * Bits 0-4 swap type if swapped * Bits 5-54 swap offset if swapped - * Bit 55 pte is soft-dirty (see Documentation/vm/soft-dirty.txt) + * Bit 55 pte is soft-dirty (see Documentation/vm/soft-dirty.rst) * Bit 56 page exclusively mapped * Bits 57-60 zero * Bit 61 page is file-page or shared-anon diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 325017ad9311..77be87c095f2 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -16,7 +16,7 @@ /* * Heterogeneous Memory Management (HMM) * - * See Documentation/vm/hmm.txt for reasons and overview of what HMM is and it + * See Documentation/vm/hmm.rst for reasons and overview of what HMM is and it * is for. Here we focus on the HMM API description, with some explanation of * the underlying implementation. * diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 7b4899c06f49..74ea5e2310a8 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -45,7 +45,7 @@ struct vmem_altmap { * must be treated as an opaque object, rather than a "normal" struct page. * * A more complete discussion of unaddressable memory may be found in - * include/linux/hmm.h and Documentation/vm/hmm.txt. + * include/linux/hmm.h and Documentation/vm/hmm.rst. * * MEMORY_DEVICE_PUBLIC: * Device memory that is cache coherent from device and CPU point of view. This @@ -67,7 +67,7 @@ enum memory_type { * page_free() * * Additional notes about MEMORY_DEVICE_PRIVATE may be found in - * include/linux/hmm.h and Documentation/vm/hmm.txt. There is also a brief + * include/linux/hmm.h and Documentation/vm/hmm.rst. There is also a brief * explanation in include/linux/memory_hotplug.h. * * The page_fault() callback must migrate page back, from device memory to diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 2d07a1ed5a31..392e6af82701 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -174,7 +174,7 @@ struct mmu_notifier_ops { * invalidate_range_start()/end() notifiers, as * invalidate_range() alread catches the points in time when an * external TLB range needs to be flushed. For more in depth - * discussion on this see Documentation/vm/mmu_notifier.txt + * discussion on this see Documentation/vm/mmu_notifier.rst * * Note that this function might be called with just a sub-range * of what was passed to invalidate_range_start()/end(), if diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 1149533aa2fa..df2c7d11f496 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -28,7 +28,7 @@ extern struct mm_struct *mm_alloc(void); * * Use mmdrop() to release the reference acquired by mmgrab(). 
* - * See also for an in-depth explanation + * See also for an in-depth explanation * of &mm_struct.mm_count vs &mm_struct.mm_users. */ static inline void mmgrab(struct mm_struct *mm) @@ -51,7 +51,7 @@ extern void mmdrop(struct mm_struct *mm); * * Use mmput() to release the reference acquired by mmget(). * - * See also for an in-depth explanation + * See also for an in-depth explanation * of &mm_struct.mm_count vs &mm_struct.mm_users. */ static inline void mmget(struct mm_struct *mm) diff --git a/include/linux/swap.h b/include/linux/swap.h index 7b6a59f722a3..4003973deff4 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -53,7 +53,7 @@ static inline int current_is_kswapd(void) /* * Unaddressable device memory support. See include/linux/hmm.h and - * Documentation/vm/hmm.txt. Short description is we need struct pages for + * Documentation/vm/hmm.rst. Short description is we need struct pages for * device memory that is unaddressable (inaccessible) by CPU, so that we can * migrate part of a process memory to device memory. * diff --git a/mm/Kconfig b/mm/Kconfig index c782e8fb7235..b9f04213a353 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -312,7 +312,7 @@ config KSM the many instances by a single page with that content, so saving memory until one or another app needs to modify the content. Recommended for use with KVM, or with other duplicative applications. - See Documentation/vm/ksm.txt for more information: KSM is inactive + See Documentation/vm/ksm.rst for more information: KSM is inactive until a program has madvised that an area is MADV_MERGEABLE, and root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set). @@ -537,7 +537,7 @@ config MEM_SOFT_DIRTY into a page just as regular dirty bit, but unlike the latter it can be cleared by hands. - See Documentation/vm/soft-dirty.txt for more details. + See Documentation/vm/soft-dirty.rst for more details. config ZSWAP bool "Compressed cache for swap pages (EXPERIMENTAL)" @@ -664,7 +664,7 @@ config IDLE_PAGE_TRACKING be useful to tune memory cgroup limits and/or for job placement within a compute cluster. - See Documentation/vm/idle_page_tracking.txt for more details. + See Documentation/vm/idle_page_tracking.rst for more details. # arch_add_memory() comprehends device memory config ARCH_HAS_ZONE_DEVICE diff --git a/mm/cleancache.c b/mm/cleancache.c index f7b9fdc79d97..126548b5a292 100644 --- a/mm/cleancache.c +++ b/mm/cleancache.c @@ -3,7 +3,7 @@ * * This code provides the generic "frontend" layer to call a matching * "backend" driver implementation of cleancache. See - * Documentation/vm/cleancache.txt for more information. + * Documentation/vm/cleancache.rst for more information. * * Copyright (C) 2009-2010 Oracle Corp. All rights reserved. * Author: Dan Magenheimer diff --git a/mm/frontswap.c b/mm/frontswap.c index fec8b5044040..4f5476a0f955 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -3,7 +3,7 @@ * * This code provides the generic "frontend" layer to call a matching * "backend" driver implementation of frontswap. See - * Documentation/vm/frontswap.txt for more information. + * Documentation/vm/frontswap.rst for more information. * * Copyright (C) 2009-2012 Oracle Corp. All rights reserved. 
* Author: Dan Magenheimer diff --git a/mm/hmm.c b/mm/hmm.c index 320545b98ff5..af176c6820cf 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -37,7 +37,7 @@ #if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC) /* - * Device private memory see HMM (Documentation/vm/hmm.txt) or hmm.h + * Device private memory see HMM (Documentation/vm/hmm.rst) or hmm.h */ DEFINE_STATIC_KEY_FALSE(device_private_key); EXPORT_SYMBOL(device_private_key); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 87ab9b8f56b5..6d5911673450 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1185,7 +1185,7 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd, * mmu_notifier_invalidate_range_end() happens which can lead to a * device seeing memory write in different order than CPU. * - * See Documentation/vm/mmu_notifier.txt + * See Documentation/vm/mmu_notifier.rst */ pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); @@ -2037,7 +2037,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, * replacing a zero pmd write protected page with a zero pte write * protected page. * - * See Documentation/vm/mmu_notifier.txt + * See Documentation/vm/mmu_notifier.rst */ pmdp_huge_clear_flush(vma, haddr, pmd); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7c204e3d132b..5af974abae46 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3289,7 +3289,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, * table protection not changing it to point * to a new page. * - * See Documentation/vm/mmu_notifier.txt + * See Documentation/vm/mmu_notifier.rst */ huge_ptep_set_wrprotect(src, addr, src_pte); } @@ -4355,7 +4355,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, * No need to call mmu_notifier_invalidate_range() we are downgrading * page table protection not changing it to point to a new page. * - * See Documentation/vm/mmu_notifier.txt + * See Documentation/vm/mmu_notifier.rst */ i_mmap_unlock_write(vma->vm_file->f_mapping); mmu_notifier_invalidate_range_end(mm, start, end); diff --git a/mm/ksm.c b/mm/ksm.c index 293721f5da70..0b88698a9014 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1049,7 +1049,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, * No need to notify as we are downgrading page table to read * only not changing it to point to a new page. * - * See Documentation/vm/mmu_notifier.txt + * See Documentation/vm/mmu_notifier.rst */ entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte); /* @@ -1138,7 +1138,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, * No need to notify as we are replacing a read only page with another * read only page with the same content. * - * See Documentation/vm/mmu_notifier.txt + * See Documentation/vm/mmu_notifier.rst */ ptep_clear_flush(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, newpte); diff --git a/mm/mmap.c b/mm/mmap.c index 9efdc021ad22..39fc51d1639c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2769,7 +2769,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, unsigned long ret = -EINVAL; struct file *file; - pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/vm/remap_file_pages.txt.\n", + pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. 
See Documentation/vm/remap_file_pages.rst.\n", current->comm, current->pid); if (prot) diff --git a/mm/rmap.c b/mm/rmap.c index 47db27f8049e..854b703fbe2a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -942,7 +942,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, * downgrading page table protection not changing it to point * to a new page. * - * See Documentation/vm/mmu_notifier.txt + * See Documentation/vm/mmu_notifier.rst */ if (ret) (*cleaned)++; @@ -1587,7 +1587,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * point at new page while a device still is using this * page. * - * See Documentation/vm/mmu_notifier.txt + * See Documentation/vm/mmu_notifier.rst */ dec_mm_counter(mm, mm_counter_file(page)); } @@ -1597,7 +1597,7 @@ discard: * done above for all cases requiring it to happen under page * table lock before mmu_notifier_invalidate_range_end() * - * See Documentation/vm/mmu_notifier.txt + * See Documentation/vm/mmu_notifier.rst */ page_remove_rmap(subpage, PageHuge(page)); put_page(page); diff --git a/mm/util.c b/mm/util.c index c1250501364f..e857c80c6f4a 100644 --- a/mm/util.c +++ b/mm/util.c @@ -609,7 +609,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed); * succeed and -ENOMEM implies there is not. * * We currently support three overcommit policies, which are set via the - * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting + * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting.rst * * Strict overcommit modes added 2002 Feb 26 by Alan Cox. * Additional code 2002 Jul 20 by Robert Love. -- cgit v1.2.3 From d1361840f8c519eaee9a78ffe09e4f0a1b586846 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Apr 2018 10:33:35 -0700 Subject: tcp: fix SO_RCVLOWAT and RCVBUF autotuning Applications might use SO_RCVLOWAT on TCP socket hoping to receive one [E]POLLIN event only when a given amount of bytes are ready in socket receive queue. Problem is that receive autotuning is not aware of this constraint, meaning sk_rcvbuf might be too small to allow all bytes to be stored. Add a new (struct proto_ops)->set_rcvlowat method so that a protocol can override the default setsockopt(SO_RCVLOWAT) behavior. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/net.h | 1 + include/net/tcp.h | 1 + net/core/sock.c | 5 ++++- net/ipv4/af_inet.c | 1 + net/ipv4/tcp.c | 21 +++++++++++++++++++++ net/ipv6/af_inet6.c | 1 + 6 files changed, 29 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/net.h b/include/linux/net.h index 2248a052061d..6554d3ba4396 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -197,6 +197,7 @@ struct proto_ops { int offset, size_t size, int flags); int (*sendmsg_locked)(struct sock *sk, struct msghdr *msg, size_t size); + int (*set_rcvlowat)(struct sock *sk, int val); }; #define DECLARE_SOCKADDR(type, dst, src) \ diff --git a/include/net/tcp.h b/include/net/tcp.h index 9c9b3768b350..b2318242cad8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -402,6 +402,7 @@ void tcp_set_keepalive(struct sock *sk, int val); void tcp_syn_ack_timeout(const struct request_sock *req); int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); +int tcp_set_rcvlowat(struct sock *sk, int val); void tcp_parse_options(const struct net *net, const struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab, struct tcp_fastopen_cookie *foc); diff --git a/net/core/sock.c b/net/core/sock.c index 6444525f610c..b2c3db169ca1 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -905,7 +905,10 @@ set_rcvbuf: case SO_RCVLOWAT: if (val < 0) val = INT_MAX; - sk->sk_rcvlowat = val ? : 1; + if (sock->ops->set_rcvlowat) + ret = sock->ops->set_rcvlowat(sk, val); + else + sk->sk_rcvlowat = val ? : 1; break; case SO_RCVTIMEO: diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index eaed0367e669..f5c562aaef35 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1006,6 +1006,7 @@ const struct proto_ops inet_stream_ops = { .compat_getsockopt = compat_sock_common_getsockopt, .compat_ioctl = inet_compat_ioctl, #endif + .set_rcvlowat = tcp_set_rcvlowat, }; EXPORT_SYMBOL(inet_stream_ops); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bccc4c270087..0abd8d1d3d1d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1701,6 +1701,27 @@ int tcp_peek_len(struct socket *sock) } EXPORT_SYMBOL(tcp_peek_len); +/* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */ +int tcp_set_rcvlowat(struct sock *sk, int val) +{ + sk->sk_rcvlowat = val ? 
: 1; + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) + return 0; + + /* val comes from user space and might be close to INT_MAX */ + val <<= 1; + if (val < 0) + val = INT_MAX; + + val = min(val, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); + if (val > sk->sk_rcvbuf) { + sk->sk_rcvbuf = val; + tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val); + } + return 0; +} +EXPORT_SYMBOL(tcp_set_rcvlowat); + static void tcp_update_recv_tstamps(struct sk_buff *skb, struct scm_timestamping *tss) { diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 8da0b513f188..e70d59fb26e1 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -590,6 +590,7 @@ const struct proto_ops inet6_stream_ops = { .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, #endif + .set_rcvlowat = tcp_set_rcvlowat, }; const struct proto_ops inet6_dgram_ops = { -- cgit v1.2.3 From 03f45c883c6f391ed4fff8292415b35bd1107519 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Apr 2018 10:33:37 -0700 Subject: tcp: avoid extra wakeups for SO_RCVLOWAT users SO_RCVLOWAT is properly handled in tcp_poll(), so that POLLIN is only generated when enough bytes are available in receive queue, after David change (commit c7004482e8dc "tcp: Respect SO_RCVLOWAT in tcp_poll().") But TCP still calls sk->sk_data_ready() for each chunk added in receive queue, meaning thread is awaken, and goes back to sleep shortly after. Tested: tcp_mmap test program, receiving 32768 MB of data with SO_RCVLOWAT set to 512KB -> Should get ~2 wakeups (c-switches) per MB, regardless of how many (tiny or big) packets were received. High speed (mostly full size GRO packets) received 32768 MB (100 % mmap'ed) in 8.03112 s, 34.2266 Gbit, cpu usage user:0.037 sys:1.404, 43.9758 usec per MB, 65497 c-switches received 32768 MB (99.9954 % mmap'ed) in 7.98453 s, 34.4263 Gbit, cpu usage user:0.03 sys:1.422, 44.3115 usec per MB, 65485 c-switches Low speed (sender is ratelimited and sends 1-MSS at a time, so GRO is not helping) received 22474.5 MB (100 % mmap'ed) in 6015.35 s, 0.0313414 Gbit, cpu usage user:0.05 sys:1.586, 72.7952 usec per MB, 44950 c-switches Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + net/ipv4/tcp.c | 4 ++++ net/ipv4/tcp_input.c | 15 +++++++++++++-- 3 files changed, 18 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index b2318242cad8..0ee85c47c185 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -403,6 +403,7 @@ void tcp_syn_ack_timeout(const struct request_sock *req); int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); int tcp_set_rcvlowat(struct sock *sk, int val); +void tcp_data_ready(struct sock *sk); void tcp_parse_options(const struct net *net, const struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab, struct tcp_fastopen_cookie *foc); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0abd8d1d3d1d..c768d306b657 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1705,6 +1705,10 @@ EXPORT_SYMBOL(tcp_peek_len); int tcp_set_rcvlowat(struct sock *sk, int val) { sk->sk_rcvlowat = val ? 
: 1; + + /* Check if we need to signal EPOLLIN right now */ + tcp_data_ready(sk); + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) return 0; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d854363a4387..f93687f97d80 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4576,6 +4576,17 @@ err: } +void tcp_data_ready(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + int avail = tp->rcv_nxt - tp->copied_seq; + + if (avail < sk->sk_rcvlowat && !sock_flag(sk, SOCK_DONE)) + return; + + sk->sk_data_ready(sk); +} + static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -4633,7 +4644,7 @@ queue_and_out: if (eaten > 0) kfree_skb_partial(skb, fragstolen); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk); + tcp_data_ready(sk); return; } @@ -5434,7 +5445,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, no_ack: if (eaten) kfree_skb_partial(skb, fragstolen); - sk->sk_data_ready(sk); + tcp_data_ready(sk); return; } } -- cgit v1.2.3 From 93ab6cc69162775201587cc9da00d5016dc890e2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Apr 2018 10:33:38 -0700 Subject: tcp: implement mmap() for zero copy receive Some networks can make sure TCP payload can exactly fit 4KB pages, with well chosen MSS/MTU and architectures. Implement mmap() system call so that applications can avoid copying data without complex splice() games. Note that a successful mmap( X bytes) on TCP socket is consuming bytes, as if recvmsg() has been done. (tp->copied += X) Only PROT_READ mappings are accepted, as skb page frags are fundamentally shared and read only. If tcp_mmap() finds data that is not a full page, or a patch of urgent data, -EINVAL is returned, no bytes are consumed. Application must fallback to recvmsg() to read the problematic sequence. mmap() wont block, regardless of socket being in blocking or non-blocking mode. If not enough bytes are in receive queue, mmap() would return -EAGAIN, or -EIO if socket is in a state where no other bytes can be added into receive queue. An application might use SO_RCVLOWAT, poll() and/or ioctl( FIONREAD) to efficiently use mmap() On the sender side, MSG_EOR might help to clearly separate unaligned headers and 4K-aligned chunks if necessary. Tested: mlx4 (cx-3) 40Gbit NIC, with tcp_mmap program provided in following patch. 
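For orientation, the receive path this enables looks roughly like the
hedged sketch below; it is illustrative, not the tcp_mmap source, and
rx_chunk() plus the application-defined consume() sink are made-up
names:

	#include <errno.h>
	#include <sys/mman.h>
	#include <sys/socket.h>

	extern void consume(const void *buf, size_t len);

	/* Map 'chunk' payload bytes straight from the socket; fall back
	 * to copying when the data is not page-aligned (EINVAL) or has
	 * not fully arrived yet (EAGAIN). */
	static ssize_t rx_chunk(int fd, size_t chunk, char *fallback)
	{
		void *p = mmap(NULL, chunk, PROT_READ, MAP_SHARED, fd, 0);

		if (p != MAP_FAILED) {
			consume(p, chunk);	/* zero copy */
			munmap(p, chunk);	/* pages return to the kernel */
			return chunk;
		}
		if (errno == EINVAL || errno == EAGAIN)
			return recv(fd, fallback, chunk, 0);
		return -1;
	}

The measurements that follow were taken with the real tcp_mmap tool.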
MTU set to 4168 (4096 TCP payload, 40 bytes IPv6 header, 32 bytes TCP header) Without mmap() (tcp_mmap -s) received 32768 MB (0 % mmap'ed) in 8.13342 s, 33.7961 Gbit, cpu usage user:0.034 sys:3.778, 116.333 usec per MB, 63062 c-switches received 32768 MB (0 % mmap'ed) in 8.14501 s, 33.748 Gbit, cpu usage user:0.029 sys:3.997, 122.864 usec per MB, 61903 c-switches received 32768 MB (0 % mmap'ed) in 8.11723 s, 33.8635 Gbit, cpu usage user:0.048 sys:3.964, 122.437 usec per MB, 62983 c-switches received 32768 MB (0 % mmap'ed) in 8.39189 s, 32.7552 Gbit, cpu usage user:0.038 sys:4.181, 128.754 usec per MB, 55834 c-switches With mmap() on receiver (tcp_mmap -s -z) received 32768 MB (100 % mmap'ed) in 8.03083 s, 34.2278 Gbit, cpu usage user:0.024 sys:1.466, 45.4712 usec per MB, 65479 c-switches received 32768 MB (100 % mmap'ed) in 7.98805 s, 34.4111 Gbit, cpu usage user:0.026 sys:1.401, 43.5486 usec per MB, 65447 c-switches received 32768 MB (100 % mmap'ed) in 7.98377 s, 34.4296 Gbit, cpu usage user:0.028 sys:1.452, 45.166 usec per MB, 65496 c-switches received 32768 MB (99.9969 % mmap'ed) in 8.01838 s, 34.281 Gbit, cpu usage user:0.02 sys:1.446, 44.7388 usec per MB, 65505 c-switches Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 2 + net/ipv4/af_inet.c | 2 +- net/ipv4/tcp.c | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv6/af_inet6.c | 2 +- 4 files changed, 117 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 0ee85c47c185..833154e3df17 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -404,6 +404,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); int tcp_set_rcvlowat(struct sock *sk, int val); void tcp_data_ready(struct sock *sk); +int tcp_mmap(struct file *file, struct socket *sock, + struct vm_area_struct *vma); void tcp_parse_options(const struct net *net, const struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab, struct tcp_fastopen_cookie *foc); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index f5c562aaef35..3ebf599cebae 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -994,7 +994,7 @@ const struct proto_ops inet_stream_ops = { .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, - .mmap = sock_no_mmap, + .mmap = tcp_mmap, .sendpage = inet_sendpage, .splice_read = tcp_splice_read, .read_sock = tcp_read_sock, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c768d306b657..438fbca96cd3 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1726,6 +1726,119 @@ int tcp_set_rcvlowat(struct sock *sk, int val) } EXPORT_SYMBOL(tcp_set_rcvlowat); +/* When user wants to mmap X pages, we first need to perform the mapping + * before freeing any skbs in receive queue, otherwise user would be unable + * to fallback to standard recvmsg(). This happens if some data in the + * requested block is not exactly fitting in a page. + * + * We only support order-0 pages for the moment. + * mmap() on TCP is very strict, there is no point + * trying to accommodate with pathological layouts. 
+ */ +int tcp_mmap(struct file *file, struct socket *sock, + struct vm_area_struct *vma) +{ + unsigned long size = vma->vm_end - vma->vm_start; + unsigned int nr_pages = size >> PAGE_SHIFT; + struct page **pages_array = NULL; + u32 seq, len, offset, nr = 0; + struct sock *sk = sock->sk; + const skb_frag_t *frags; + struct tcp_sock *tp; + struct sk_buff *skb; + int ret; + + if (vma->vm_pgoff || !nr_pages) + return -EINVAL; + + if (vma->vm_flags & VM_WRITE) + return -EPERM; + /* TODO: Maybe the following is not needed if pages are COW */ + vma->vm_flags &= ~VM_MAYWRITE; + + lock_sock(sk); + + ret = -ENOTCONN; + if (sk->sk_state == TCP_LISTEN) + goto out; + + sock_rps_record_flow(sk); + + if (tcp_inq(sk) < size) { + ret = sock_flag(sk, SOCK_DONE) ? -EIO : -EAGAIN; + goto out; + } + tp = tcp_sk(sk); + seq = tp->copied_seq; + /* Abort if urgent data is in the area */ + if (unlikely(tp->urg_data)) { + u32 urg_offset = tp->urg_seq - seq; + + ret = -EINVAL; + if (urg_offset < size) + goto out; + } + ret = -ENOMEM; + pages_array = kvmalloc_array(nr_pages, sizeof(struct page *), + GFP_KERNEL); + if (!pages_array) + goto out; + skb = tcp_recv_skb(sk, seq, &offset); + ret = -EINVAL; +skb_start: + /* We do not support anything not in page frags */ + offset -= skb_headlen(skb); + if ((int)offset < 0) + goto out; + if (skb_has_frag_list(skb)) + goto out; + len = skb->data_len - offset; + frags = skb_shinfo(skb)->frags; + while (offset) { + if (frags->size > offset) + goto out; + offset -= frags->size; + frags++; + } + while (nr < nr_pages) { + if (len) { + if (len < PAGE_SIZE) + goto out; + if (frags->size != PAGE_SIZE || frags->page_offset) + goto out; + pages_array[nr++] = skb_frag_page(frags); + frags++; + len -= PAGE_SIZE; + seq += PAGE_SIZE; + continue; + } + skb = skb->next; + offset = seq - TCP_SKB_CB(skb)->seq; + goto skb_start; + } + /* OK, we have a full set of pages ready to be inserted into vma */ + for (nr = 0; nr < nr_pages; nr++) { + ret = vm_insert_page(vma, vma->vm_start + (nr << PAGE_SHIFT), + pages_array[nr]); + if (ret) + goto out; + } + /* operation is complete, we can 'consume' all skbs */ + tp->copied_seq = seq; + tcp_rcv_space_adjust(sk); + + /* Clean up data we have read: This will do ACK frames. */ + tcp_recv_skb(sk, seq, &offset); + tcp_cleanup_rbuf(sk, size); + + ret = 0; +out: + release_sock(sk); + kvfree(pages_array); + return ret; +} +EXPORT_SYMBOL(tcp_mmap); + static void tcp_update_recv_tstamps(struct sk_buff *skb, struct scm_timestamping *tss) { diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index e70d59fb26e1..2c694912df2e 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -579,7 +579,7 @@ const struct proto_ops inet6_stream_ops = { .getsockopt = sock_common_getsockopt, /* ok */ .sendmsg = inet_sendmsg, /* ok */ .recvmsg = inet_recvmsg, /* ok */ - .mmap = sock_no_mmap, + .mmap = tcp_mmap, .sendpage = inet_sendpage, .sendmsg_locked = tcp_sendmsg_locked, .sendpage_locked = tcp_sendpage_locked, -- cgit v1.2.3 From a5724fc3834643a975bd0db71f001ca65f4a8382 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 16 Apr 2018 21:37:13 +0200 Subject: PCI: Add two more values for PCIe Max_Read_Request_Size This patch adds missing values for the max read request size. E.g. network driver r8169 uses a value of 4K. Signed-off-by: Heiner Kallweit Acked-by: Bjorn Helgaas Signed-off-by: David S. 
Miller --- include/uapi/linux/pci_regs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 103ba797a8f3..83ade9b5cf95 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -506,6 +506,8 @@ #define PCI_EXP_DEVCTL_READRQ_256B 0x1000 /* 256 Bytes */ #define PCI_EXP_DEVCTL_READRQ_512B 0x2000 /* 512 Bytes */ #define PCI_EXP_DEVCTL_READRQ_1024B 0x3000 /* 1024 Bytes */ +#define PCI_EXP_DEVCTL_READRQ_2048B 0x4000 /* 2048 Bytes */ +#define PCI_EXP_DEVCTL_READRQ_4096B 0x5000 /* 4096 Bytes */ #define PCI_EXP_DEVCTL_BCR_FLR 0x8000 /* Bridge Configuration Retry / FLR */ #define PCI_EXP_DEVSTA 10 /* Device Status */ #define PCI_EXP_DEVSTA_CED 0x0001 /* Correctable Error Detected */ -- cgit v1.2.3 From ef53e9e14714de2ce26eaae0244c07c426064d69 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Mon, 16 Apr 2018 15:07:13 -0700 Subject: net: Remove unused tcp_set_state tracepoint This tracepoint was replaced by inet_sock_set_state in 563e0bb and not used anywhere in the kernel anymore. Remove it. Signed-off-by: Andrey Ignatov Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 47 ---------------------------------------------- 1 file changed, 47 deletions(-) (limited to 'include') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 878b2be7ce77..3dd68029d77a 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -166,53 +166,6 @@ DEFINE_EVENT(tcp_event_sk, tcp_destroy_sock, TP_ARGS(sk) ); -TRACE_EVENT(tcp_set_state, - - TP_PROTO(const struct sock *sk, const int oldstate, const int newstate), - - TP_ARGS(sk, oldstate, newstate), - - TP_STRUCT__entry( - __field(const void *, skaddr) - __field(int, oldstate) - __field(int, newstate) - __field(__u16, sport) - __field(__u16, dport) - __array(__u8, saddr, 4) - __array(__u8, daddr, 4) - __array(__u8, saddr_v6, 16) - __array(__u8, daddr_v6, 16) - ), - - TP_fast_assign( - struct inet_sock *inet = inet_sk(sk); - __be32 *p32; - - __entry->skaddr = sk; - __entry->oldstate = oldstate; - __entry->newstate = newstate; - - __entry->sport = ntohs(inet->inet_sport); - __entry->dport = ntohs(inet->inet_dport); - - p32 = (__be32 *) __entry->saddr; - *p32 = inet->inet_saddr; - - p32 = (__be32 *) __entry->daddr; - *p32 = inet->inet_daddr; - - TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, - sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); - ), - - TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c oldstate=%s newstate=%s", - __entry->sport, __entry->dport, - __entry->saddr, __entry->daddr, - __entry->saddr_v6, __entry->daddr_v6, - show_tcp_state_name(__entry->oldstate), - show_tcp_state_name(__entry->newstate)) -); - TRACE_EVENT(tcp_retransmit_synack, TP_PROTO(const struct sock *sk, const struct request_sock *req), -- cgit v1.2.3 From e59644b720aed4b9ec9d3818b483f97376fb31ed Mon Sep 17 00:00:00 2001 From: Sargun Dhillon Date: Sun, 1 Apr 2018 08:42:08 +0000 Subject: security: remove security_settime security_settime was a wrapper around security_settime64. There are no more users of it. Therefore it can be removed. 
It was removed in: commit 4eb1bca17933 ("time: Use do_settimeofday64() internally") Signed-off-by: Sargun Dhillon Signed-off-by: James Morris --- include/linux/security.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include') diff --git a/include/linux/security.h b/include/linux/security.h index 3f5fd988ee87..5111fe8159ce 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -222,12 +222,6 @@ int security_quotactl(int cmds, int type, int id, struct super_block *sb); int security_quota_on(struct dentry *dentry); int security_syslog(int type); int security_settime64(const struct timespec64 *ts, const struct timezone *tz); -static inline int security_settime(const struct timespec *ts, const struct timezone *tz) -{ - struct timespec64 ts64 = timespec_to_timespec64(*ts); - - return security_settime64(&ts64, tz); -} int security_vm_enough_memory_mm(struct mm_struct *mm, long pages); int security_bprm_set_creds(struct linux_binprm *bprm); int security_bprm_check(struct linux_binprm *bprm); @@ -509,14 +503,6 @@ static inline int security_settime64(const struct timespec64 *ts, return cap_settime(ts, tz); } -static inline int security_settime(const struct timespec *ts, - const struct timezone *tz) -{ - struct timespec64 ts64 = timespec_to_timespec64(*ts); - - return cap_settime(&ts64, tz); -} - static inline int security_vm_enough_memory_mm(struct mm_struct *mm, long pages) { return __vm_enough_memory(mm, pages, cap_vm_enough_memory(mm, pages)); -- cgit v1.2.3 From b5f5f525c547e05fad3ecb4c8d6ceef9cdb14ac3 Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Tue, 27 Mar 2018 08:25:18 -0700 Subject: clk: qcom: Add MSM8998 Global Clock Control (GCC) driver Add support for the global clock controller found on MSM8998 based devices. This should allow most non-multimedia device drivers to probe and control their clocks. Signed-off-by: Joonwoo Park Signed-off-by: Imran Khan Signed-off-by: Rajendra Nayak [bjorn: Specify regs for alpha_plls, fix white spaces and add binding] Signed-off-by: Bjorn Andersson Signed-off-by: Stephen Boyd --- .../devicetree/bindings/clock/qcom,gcc.txt | 1 + drivers/clk/qcom/Kconfig | 8 + drivers/clk/qcom/Makefile | 1 + drivers/clk/qcom/gcc-msm8998.c | 2834 ++++++++++++++++++++ include/dt-bindings/clock/qcom,gcc-msm8998.h | 208 ++ 5 files changed, 3052 insertions(+) create mode 100644 drivers/clk/qcom/gcc-msm8998.c create mode 100644 include/dt-bindings/clock/qcom,gcc-msm8998.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc.txt b/Documentation/devicetree/bindings/clock/qcom,gcc.txt index 551d03be9665..d1fb8b213dde 100644 --- a/Documentation/devicetree/bindings/clock/qcom,gcc.txt +++ b/Documentation/devicetree/bindings/clock/qcom,gcc.txt @@ -17,6 +17,7 @@ Required properties : "qcom,gcc-msm8974pro-ac" "qcom,gcc-msm8994" "qcom,gcc-msm8996" + "qcom,gcc-msm8998" "qcom,gcc-mdm9615" - reg : shall contain base register location and length diff --git a/drivers/clk/qcom/Kconfig b/drivers/clk/qcom/Kconfig index fbf4532f94b8..e42e1afb0c51 100644 --- a/drivers/clk/qcom/Kconfig +++ b/drivers/clk/qcom/Kconfig @@ -218,6 +218,14 @@ config MSM_MMCC_8996 Say Y if you want to support multimedia devices such as display, graphics, video encode/decode, camera, etc. +config MSM_GCC_8998 + tristate "MSM8998 Global Clock Controller" + depends on COMMON_CLK_QCOM + help + Support for the global clock controller on msm8998 devices. + Say Y if you want to use peripheral devices such as UART, SPI, + i2c, USB, UFS, SD/eMMC, PCIe, etc. 
+ config SPMI_PMIC_CLKDIV tristate "SPMI PMIC clkdiv Support" depends on (COMMON_CLK_QCOM && SPMI) || COMPILE_TEST diff --git a/drivers/clk/qcom/Makefile b/drivers/clk/qcom/Makefile index 230332cf317e..7c09ab1a640c 100644 --- a/drivers/clk/qcom/Makefile +++ b/drivers/clk/qcom/Makefile @@ -30,6 +30,7 @@ obj-$(CONFIG_MSM_GCC_8974) += gcc-msm8974.o obj-$(CONFIG_MSM_GCC_8994) += gcc-msm8994.o obj-$(CONFIG_MSM_GCC_8996) += gcc-msm8996.o obj-$(CONFIG_MSM_LCC_8960) += lcc-msm8960.o +obj-$(CONFIG_MSM_GCC_8998) += gcc-msm8998.o obj-$(CONFIG_MSM_MMCC_8960) += mmcc-msm8960.o obj-$(CONFIG_MSM_MMCC_8974) += mmcc-msm8974.o obj-$(CONFIG_MSM_MMCC_8996) += mmcc-msm8996.o diff --git a/drivers/clk/qcom/gcc-msm8998.c b/drivers/clk/qcom/gcc-msm8998.c new file mode 100644 index 000000000000..78d87f5c7098 --- /dev/null +++ b/drivers/clk/qcom/gcc-msm8998.c @@ -0,0 +1,2834 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2016, The Linux Foundation. All rights reserved. + */ + +#include <linux/kernel.h> +#include <linux/bitops.h> +#include <linux/err.h> +#include <linux/platform_device.h> +#include <linux/module.h> +#include <linux/of.h> +#include <linux/of_device.h> +#include <linux/clk-provider.h> +#include <linux/regmap.h> +#include <linux/reset-controller.h> + +#include <dt-bindings/clock/qcom,gcc-msm8998.h> + +#include "common.h" +#include "clk-regmap.h" +#include "clk-alpha-pll.h" +#include "clk-pll.h" +#include "clk-rcg.h" +#include "clk-branch.h" +#include "reset.h" +#include "gdsc.h" + +#define F(f, s, h, m, n) { (f), (s), (2 * (h) - 1), (m), (n) } + +enum { + P_AUD_REF_CLK, + P_CORE_BI_PLL_TEST_SE, + P_GPLL0_OUT_MAIN, + P_GPLL4_OUT_MAIN, + P_PLL0_EARLY_DIV_CLK_SRC, + P_SLEEP_CLK, + P_XO, +}; + +static const struct parent_map gcc_parent_map_0[] = { + { P_XO, 0 }, + { P_GPLL0_OUT_MAIN, 1 }, + { P_PLL0_EARLY_DIV_CLK_SRC, 6 }, + { P_CORE_BI_PLL_TEST_SE, 7 }, +}; + +static const char * const gcc_parent_names_0[] = { + "xo", + "gpll0_out_main", + "gpll0_out_main", + "core_bi_pll_test_se", +}; + +static const struct parent_map gcc_parent_map_1[] = { + { P_XO, 0 }, + { P_GPLL0_OUT_MAIN, 1 }, + { P_CORE_BI_PLL_TEST_SE, 7 }, +}; + +static const char * const gcc_parent_names_1[] = { + "xo", + "gpll0_out_main", + "core_bi_pll_test_se", +}; + +static const struct parent_map gcc_parent_map_2[] = { + { P_XO, 0 }, + { P_GPLL0_OUT_MAIN, 1 }, + { P_SLEEP_CLK, 5 }, + { P_PLL0_EARLY_DIV_CLK_SRC, 6 }, + { P_CORE_BI_PLL_TEST_SE, 7 }, +}; + +static const char * const gcc_parent_names_2[] = { + "xo", + "gpll0_out_main", + "core_pi_sleep_clk", + "gpll0_out_main", + "core_bi_pll_test_se", +}; + +static const struct parent_map gcc_parent_map_3[] = { + { P_XO, 0 }, + { P_SLEEP_CLK, 5 }, + { P_CORE_BI_PLL_TEST_SE, 7 }, +}; + +static const char * const gcc_parent_names_3[] = { + "xo", + "core_pi_sleep_clk", + "core_bi_pll_test_se", +}; + +static const struct parent_map gcc_parent_map_4[] = { + { P_XO, 0 }, + { P_GPLL0_OUT_MAIN, 1 }, + { P_GPLL4_OUT_MAIN, 5 }, + { P_CORE_BI_PLL_TEST_SE, 7 }, +}; + +static const char * const gcc_parent_names_4[] = { + "xo", + "gpll0_out_main", + "gpll4_out_main", + "core_bi_pll_test_se", +}; + +static const struct parent_map gcc_parent_map_5[] = { + { P_XO, 0 }, + { P_GPLL0_OUT_MAIN, 1 }, + { P_AUD_REF_CLK, 2 }, + { P_CORE_BI_PLL_TEST_SE, 7 }, +}; + +static const char * const gcc_parent_names_5[] = { + "xo", + "gpll0_out_main", + "aud_ref_clk", + "core_bi_pll_test_se", +}; + +static struct pll_vco fabia_vco[] = { + { 250000000, 2000000000, 0 }, + { 125000000, 1000000000, 1 }, +}; + +static struct clk_alpha_pll gpll0 = { + .offset = 0x0, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .vco_table = fabia_vco, + .num_vco = ARRAY_SIZE(fabia_vco), + .clkr = { + .enable_reg = 0x52000, + .enable_mask = BIT(0),
+ .hw.init = &(struct clk_init_data){ + .name = "gpll0", + .parent_names = (const char *[]){ "xo" }, + .num_parents = 1, + .ops = &clk_alpha_pll_ops, + } + }, +}; + +static struct clk_alpha_pll_postdiv gpll0_out_even = { + .offset = 0x0, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .clkr.hw.init = &(struct clk_init_data){ + .name = "gpll0_out_even", + .parent_names = (const char *[]){ "gpll0" }, + .num_parents = 1, + .ops = &clk_alpha_pll_postdiv_ops, + }, +}; + +static struct clk_alpha_pll_postdiv gpll0_out_main = { + .offset = 0x0, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .clkr.hw.init = &(struct clk_init_data){ + .name = "gpll0_out_main", + .parent_names = (const char *[]){ "gpll0" }, + .num_parents = 1, + .ops = &clk_alpha_pll_postdiv_ops, + }, +}; + +static struct clk_alpha_pll_postdiv gpll0_out_odd = { + .offset = 0x0, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .clkr.hw.init = &(struct clk_init_data){ + .name = "gpll0_out_odd", + .parent_names = (const char *[]){ "gpll0" }, + .num_parents = 1, + .ops = &clk_alpha_pll_postdiv_ops, + }, +}; + +static struct clk_alpha_pll_postdiv gpll0_out_test = { + .offset = 0x0, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .clkr.hw.init = &(struct clk_init_data){ + .name = "gpll0_out_test", + .parent_names = (const char *[]){ "gpll0" }, + .num_parents = 1, + .ops = &clk_alpha_pll_postdiv_ops, + }, +}; + +static struct clk_alpha_pll gpll1 = { + .offset = 0x1000, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .vco_table = fabia_vco, + .num_vco = ARRAY_SIZE(fabia_vco), + .clkr = { + .enable_reg = 0x52000, + .enable_mask = BIT(1), + .hw.init = &(struct clk_init_data){ + .name = "gpll1", + .parent_names = (const char *[]){ "xo" }, + .num_parents = 1, + .ops = &clk_alpha_pll_ops, + } + }, +}; + +static struct clk_alpha_pll_postdiv gpll1_out_even = { + .offset = 0x1000, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .clkr.hw.init = &(struct clk_init_data){ + .name = "gpll1_out_even", + .parent_names = (const char *[]){ "gpll1" }, + .num_parents = 1, + .ops = &clk_alpha_pll_postdiv_ops, + }, +}; + +static struct clk_alpha_pll_postdiv gpll1_out_main = { + .offset = 0x1000, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .clkr.hw.init = &(struct clk_init_data){ + .name = "gpll1_out_main", + .parent_names = (const char *[]){ "gpll1" }, + .num_parents = 1, + .ops = &clk_alpha_pll_postdiv_ops, + }, +}; + +static struct clk_alpha_pll_postdiv gpll1_out_odd = { + .offset = 0x1000, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .clkr.hw.init = &(struct clk_init_data){ + .name = "gpll1_out_odd", + .parent_names = (const char *[]){ "gpll1" }, + .num_parents = 1, + .ops = &clk_alpha_pll_postdiv_ops, + }, +}; + +static struct clk_alpha_pll_postdiv gpll1_out_test = { + .offset = 0x1000, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .clkr.hw.init = &(struct clk_init_data){ + .name = "gpll1_out_test", + .parent_names = (const char *[]){ "gpll1" }, + .num_parents = 1, + .ops = &clk_alpha_pll_postdiv_ops, + }, +}; + +static struct clk_alpha_pll gpll2 = { + .offset = 0x2000, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .vco_table = fabia_vco, + .num_vco = ARRAY_SIZE(fabia_vco), + .clkr = { + .enable_reg = 0x52000, + .enable_mask = BIT(2), + .hw.init = &(struct clk_init_data){ + .name = "gpll2", + .parent_names = (const char *[]){ "xo" }, + .num_parents = 1, + .ops = &clk_alpha_pll_ops, + } + }, +}; + +static struct 
clk_alpha_pll_postdiv gpll2_out_even = { + .offset = 0x2000, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .clkr.hw.init = &(struct clk_init_data){ + .name = "gpll2_out_even", + .parent_names = (const char *[]){ "gpll2" }, + .num_parents = 1, + .ops = &clk_alpha_pll_postdiv_ops, + }, +}; + +static struct clk_alpha_pll_postdiv gpll2_out_main = { + .offset = 0x2000, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .clkr.hw.init = &(struct clk_init_data){ + .name = "gpll2_out_main", + .parent_names = (const char *[]){ "gpll2" }, + .num_parents = 1, + .ops = &clk_alpha_pll_postdiv_ops, + }, +}; + +static struct clk_alpha_pll_postdiv gpll2_out_odd = { + .offset = 0x2000, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .clkr.hw.init = &(struct clk_init_data){ + .name = "gpll2_out_odd", + .parent_names = (const char *[]){ "gpll2" }, + .num_parents = 1, + .ops = &clk_alpha_pll_postdiv_ops, + }, +}; + +static struct clk_alpha_pll_postdiv gpll2_out_test = { + .offset = 0x2000, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .clkr.hw.init = &(struct clk_init_data){ + .name = "gpll2_out_test", + .parent_names = (const char *[]){ "gpll2" }, + .num_parents = 1, + .ops = &clk_alpha_pll_postdiv_ops, + }, +}; + +static struct clk_alpha_pll gpll3 = { + .offset = 0x3000, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .vco_table = fabia_vco, + .num_vco = ARRAY_SIZE(fabia_vco), + .clkr = { + .enable_reg = 0x52000, + .enable_mask = BIT(3), + .hw.init = &(struct clk_init_data){ + .name = "gpll3", + .parent_names = (const char *[]){ "xo" }, + .num_parents = 1, + .ops = &clk_alpha_pll_ops, + } + }, +}; + +static struct clk_alpha_pll_postdiv gpll3_out_even = { + .offset = 0x3000, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .clkr.hw.init = &(struct clk_init_data){ + .name = "gpll3_out_even", + .parent_names = (const char *[]){ "gpll3" }, + .num_parents = 1, + .ops = &clk_alpha_pll_postdiv_ops, + }, +}; + +static struct clk_alpha_pll_postdiv gpll3_out_main = { + .offset = 0x3000, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .clkr.hw.init = &(struct clk_init_data){ + .name = "gpll3_out_main", + .parent_names = (const char *[]){ "gpll3" }, + .num_parents = 1, + .ops = &clk_alpha_pll_postdiv_ops, + }, +}; + +static struct clk_alpha_pll_postdiv gpll3_out_odd = { + .offset = 0x3000, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .clkr.hw.init = &(struct clk_init_data){ + .name = "gpll3_out_odd", + .parent_names = (const char *[]){ "gpll3" }, + .num_parents = 1, + .ops = &clk_alpha_pll_postdiv_ops, + }, +}; + +static struct clk_alpha_pll_postdiv gpll3_out_test = { + .offset = 0x3000, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .clkr.hw.init = &(struct clk_init_data){ + .name = "gpll3_out_test", + .parent_names = (const char *[]){ "gpll3" }, + .num_parents = 1, + .ops = &clk_alpha_pll_postdiv_ops, + }, +}; + +static struct clk_alpha_pll gpll4 = { + .offset = 0x77000, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .vco_table = fabia_vco, + .num_vco = ARRAY_SIZE(fabia_vco), + .clkr = { + .enable_reg = 0x52000, + .enable_mask = BIT(4), + .hw.init = &(struct clk_init_data){ + .name = "gpll4", + .parent_names = (const char *[]){ "xo" }, + .num_parents = 1, + .ops = &clk_alpha_pll_ops, + } + }, +}; + +static struct clk_alpha_pll_postdiv gpll4_out_even = { + .offset = 0x77000, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .clkr.hw.init = &(struct clk_init_data){ + .name = 
"gpll4_out_even", + .parent_names = (const char *[]){ "gpll4" }, + .num_parents = 1, + .ops = &clk_alpha_pll_postdiv_ops, + }, +}; + +static struct clk_alpha_pll_postdiv gpll4_out_main = { + .offset = 0x77000, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .clkr.hw.init = &(struct clk_init_data){ + .name = "gpll4_out_main", + .parent_names = (const char *[]){ "gpll4" }, + .num_parents = 1, + .ops = &clk_alpha_pll_postdiv_ops, + }, +}; + +static struct clk_alpha_pll_postdiv gpll4_out_odd = { + .offset = 0x77000, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .clkr.hw.init = &(struct clk_init_data){ + .name = "gpll4_out_odd", + .parent_names = (const char *[]){ "gpll4" }, + .num_parents = 1, + .ops = &clk_alpha_pll_postdiv_ops, + }, +}; + +static struct clk_alpha_pll_postdiv gpll4_out_test = { + .offset = 0x77000, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .clkr.hw.init = &(struct clk_init_data){ + .name = "gpll4_out_test", + .parent_names = (const char *[]){ "gpll4" }, + .num_parents = 1, + .ops = &clk_alpha_pll_postdiv_ops, + }, +}; + +static const struct freq_tbl ftbl_blsp1_qup1_i2c_apps_clk_src[] = { + F(19200000, P_XO, 1, 0, 0), + F(50000000, P_GPLL0_OUT_MAIN, 12, 0, 0), + { } +}; + +static struct clk_rcg2 blsp1_qup1_i2c_apps_clk_src = { + .cmd_rcgr = 0x19020, + .mnd_width = 0, + .hid_width = 5, + .parent_map = gcc_parent_map_1, + .freq_tbl = ftbl_blsp1_qup1_i2c_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp1_qup1_i2c_apps_clk_src", + .parent_names = gcc_parent_names_1, + .num_parents = 3, + .ops = &clk_rcg2_ops, + }, +}; + +static const struct freq_tbl ftbl_blsp1_qup1_spi_apps_clk_src[] = { + F(960000, P_XO, 10, 1, 2), + F(4800000, P_XO, 4, 0, 0), + F(9600000, P_XO, 2, 0, 0), + F(15000000, P_GPLL0_OUT_MAIN, 10, 1, 4), + F(19200000, P_XO, 1, 0, 0), + F(25000000, P_GPLL0_OUT_MAIN, 12, 1, 2), + F(50000000, P_GPLL0_OUT_MAIN, 12, 0, 0), + { } +}; + +static struct clk_rcg2 blsp1_qup1_spi_apps_clk_src = { + .cmd_rcgr = 0x1900c, + .mnd_width = 8, + .hid_width = 5, + .parent_map = gcc_parent_map_0, + .freq_tbl = ftbl_blsp1_qup1_spi_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp1_qup1_spi_apps_clk_src", + .parent_names = gcc_parent_names_0, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp1_qup2_i2c_apps_clk_src = { + .cmd_rcgr = 0x1b020, + .mnd_width = 0, + .hid_width = 5, + .parent_map = gcc_parent_map_1, + .freq_tbl = ftbl_blsp1_qup1_i2c_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp1_qup2_i2c_apps_clk_src", + .parent_names = gcc_parent_names_1, + .num_parents = 3, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp1_qup2_spi_apps_clk_src = { + .cmd_rcgr = 0x1b00c, + .mnd_width = 8, + .hid_width = 5, + .parent_map = gcc_parent_map_0, + .freq_tbl = ftbl_blsp1_qup1_spi_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp1_qup2_spi_apps_clk_src", + .parent_names = gcc_parent_names_0, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp1_qup3_i2c_apps_clk_src = { + .cmd_rcgr = 0x1d020, + .mnd_width = 0, + .hid_width = 5, + .parent_map = gcc_parent_map_1, + .freq_tbl = ftbl_blsp1_qup1_i2c_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp1_qup3_i2c_apps_clk_src", + .parent_names = gcc_parent_names_1, + .num_parents = 3, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp1_qup3_spi_apps_clk_src = { + .cmd_rcgr = 0x1d00c, + .mnd_width = 8, + .hid_width 
= 5, + .parent_map = gcc_parent_map_0, + .freq_tbl = ftbl_blsp1_qup1_spi_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp1_qup3_spi_apps_clk_src", + .parent_names = gcc_parent_names_0, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp1_qup4_i2c_apps_clk_src = { + .cmd_rcgr = 0x1f020, + .mnd_width = 0, + .hid_width = 5, + .parent_map = gcc_parent_map_1, + .freq_tbl = ftbl_blsp1_qup1_i2c_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp1_qup4_i2c_apps_clk_src", + .parent_names = gcc_parent_names_1, + .num_parents = 3, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp1_qup4_spi_apps_clk_src = { + .cmd_rcgr = 0x1f00c, + .mnd_width = 8, + .hid_width = 5, + .parent_map = gcc_parent_map_0, + .freq_tbl = ftbl_blsp1_qup1_spi_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp1_qup4_spi_apps_clk_src", + .parent_names = gcc_parent_names_0, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp1_qup5_i2c_apps_clk_src = { + .cmd_rcgr = 0x21020, + .mnd_width = 0, + .hid_width = 5, + .parent_map = gcc_parent_map_1, + .freq_tbl = ftbl_blsp1_qup1_i2c_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp1_qup5_i2c_apps_clk_src", + .parent_names = gcc_parent_names_1, + .num_parents = 3, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp1_qup5_spi_apps_clk_src = { + .cmd_rcgr = 0x2100c, + .mnd_width = 8, + .hid_width = 5, + .parent_map = gcc_parent_map_0, + .freq_tbl = ftbl_blsp1_qup1_spi_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp1_qup5_spi_apps_clk_src", + .parent_names = gcc_parent_names_0, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp1_qup6_i2c_apps_clk_src = { + .cmd_rcgr = 0x23020, + .mnd_width = 0, + .hid_width = 5, + .parent_map = gcc_parent_map_1, + .freq_tbl = ftbl_blsp1_qup1_i2c_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp1_qup6_i2c_apps_clk_src", + .parent_names = gcc_parent_names_1, + .num_parents = 3, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp1_qup6_spi_apps_clk_src = { + .cmd_rcgr = 0x2300c, + .mnd_width = 8, + .hid_width = 5, + .parent_map = gcc_parent_map_0, + .freq_tbl = ftbl_blsp1_qup1_spi_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp1_qup6_spi_apps_clk_src", + .parent_names = gcc_parent_names_0, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static const struct freq_tbl ftbl_blsp1_uart1_apps_clk_src[] = { + F(3686400, P_GPLL0_OUT_MAIN, 1, 96, 15625), + F(7372800, P_GPLL0_OUT_MAIN, 1, 192, 15625), + F(14745600, P_GPLL0_OUT_MAIN, 1, 384, 15625), + F(16000000, P_GPLL0_OUT_MAIN, 5, 2, 15), + F(19200000, P_XO, 1, 0, 0), + F(24000000, P_GPLL0_OUT_MAIN, 5, 1, 5), + F(32000000, P_GPLL0_OUT_MAIN, 1, 4, 75), + F(40000000, P_GPLL0_OUT_MAIN, 15, 0, 0), + F(46400000, P_GPLL0_OUT_MAIN, 1, 29, 375), + F(48000000, P_GPLL0_OUT_MAIN, 12.5, 0, 0), + F(51200000, P_GPLL0_OUT_MAIN, 1, 32, 375), + F(56000000, P_GPLL0_OUT_MAIN, 1, 7, 75), + F(58982400, P_GPLL0_OUT_MAIN, 1, 1536, 15625), + F(60000000, P_GPLL0_OUT_MAIN, 10, 0, 0), + F(63157895, P_GPLL0_OUT_MAIN, 9.5, 0, 0), + { } +}; + +static struct clk_rcg2 blsp1_uart1_apps_clk_src = { + .cmd_rcgr = 0x1a00c, + .mnd_width = 16, + .hid_width = 5, + .parent_map = gcc_parent_map_0, + .freq_tbl = ftbl_blsp1_uart1_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp1_uart1_apps_clk_src", + .parent_names = gcc_parent_names_0, + 
.num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp1_uart2_apps_clk_src = { + .cmd_rcgr = 0x1c00c, + .mnd_width = 16, + .hid_width = 5, + .parent_map = gcc_parent_map_0, + .freq_tbl = ftbl_blsp1_uart1_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp1_uart2_apps_clk_src", + .parent_names = gcc_parent_names_0, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp1_uart3_apps_clk_src = { + .cmd_rcgr = 0x1e00c, + .mnd_width = 16, + .hid_width = 5, + .parent_map = gcc_parent_map_0, + .freq_tbl = ftbl_blsp1_uart1_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp1_uart3_apps_clk_src", + .parent_names = gcc_parent_names_0, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp2_qup1_i2c_apps_clk_src = { + .cmd_rcgr = 0x26020, + .mnd_width = 0, + .hid_width = 5, + .parent_map = gcc_parent_map_1, + .freq_tbl = ftbl_blsp1_qup1_i2c_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp2_qup1_i2c_apps_clk_src", + .parent_names = gcc_parent_names_1, + .num_parents = 3, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp2_qup1_spi_apps_clk_src = { + .cmd_rcgr = 0x2600c, + .mnd_width = 8, + .hid_width = 5, + .parent_map = gcc_parent_map_0, + .freq_tbl = ftbl_blsp1_qup1_spi_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp2_qup1_spi_apps_clk_src", + .parent_names = gcc_parent_names_0, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp2_qup2_i2c_apps_clk_src = { + .cmd_rcgr = 0x28020, + .mnd_width = 0, + .hid_width = 5, + .parent_map = gcc_parent_map_1, + .freq_tbl = ftbl_blsp1_qup1_i2c_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp2_qup2_i2c_apps_clk_src", + .parent_names = gcc_parent_names_1, + .num_parents = 3, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp2_qup2_spi_apps_clk_src = { + .cmd_rcgr = 0x2800c, + .mnd_width = 8, + .hid_width = 5, + .parent_map = gcc_parent_map_0, + .freq_tbl = ftbl_blsp1_qup1_spi_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp2_qup2_spi_apps_clk_src", + .parent_names = gcc_parent_names_0, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp2_qup3_i2c_apps_clk_src = { + .cmd_rcgr = 0x2a020, + .mnd_width = 0, + .hid_width = 5, + .parent_map = gcc_parent_map_1, + .freq_tbl = ftbl_blsp1_qup1_i2c_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp2_qup3_i2c_apps_clk_src", + .parent_names = gcc_parent_names_1, + .num_parents = 3, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp2_qup3_spi_apps_clk_src = { + .cmd_rcgr = 0x2a00c, + .mnd_width = 8, + .hid_width = 5, + .parent_map = gcc_parent_map_0, + .freq_tbl = ftbl_blsp1_qup1_spi_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp2_qup3_spi_apps_clk_src", + .parent_names = gcc_parent_names_0, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp2_qup4_i2c_apps_clk_src = { + .cmd_rcgr = 0x2c020, + .mnd_width = 0, + .hid_width = 5, + .parent_map = gcc_parent_map_1, + .freq_tbl = ftbl_blsp1_qup1_i2c_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp2_qup4_i2c_apps_clk_src", + .parent_names = gcc_parent_names_1, + .num_parents = 3, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp2_qup4_spi_apps_clk_src = { + .cmd_rcgr = 0x2c00c, + .mnd_width = 8, + .hid_width = 5, + .parent_map = gcc_parent_map_0, + 
.freq_tbl = ftbl_blsp1_qup1_spi_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp2_qup4_spi_apps_clk_src", + .parent_names = gcc_parent_names_0, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp2_qup5_i2c_apps_clk_src = { + .cmd_rcgr = 0x2e020, + .mnd_width = 0, + .hid_width = 5, + .parent_map = gcc_parent_map_1, + .freq_tbl = ftbl_blsp1_qup1_i2c_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp2_qup5_i2c_apps_clk_src", + .parent_names = gcc_parent_names_1, + .num_parents = 3, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp2_qup5_spi_apps_clk_src = { + .cmd_rcgr = 0x2e00c, + .mnd_width = 8, + .hid_width = 5, + .parent_map = gcc_parent_map_0, + .freq_tbl = ftbl_blsp1_qup1_spi_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp2_qup5_spi_apps_clk_src", + .parent_names = gcc_parent_names_0, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp2_qup6_i2c_apps_clk_src = { + .cmd_rcgr = 0x30020, + .mnd_width = 0, + .hid_width = 5, + .parent_map = gcc_parent_map_1, + .freq_tbl = ftbl_blsp1_qup1_i2c_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp2_qup6_i2c_apps_clk_src", + .parent_names = gcc_parent_names_1, + .num_parents = 3, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp2_qup6_spi_apps_clk_src = { + .cmd_rcgr = 0x3000c, + .mnd_width = 8, + .hid_width = 5, + .parent_map = gcc_parent_map_0, + .freq_tbl = ftbl_blsp1_qup1_spi_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp2_qup6_spi_apps_clk_src", + .parent_names = gcc_parent_names_0, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp2_uart1_apps_clk_src = { + .cmd_rcgr = 0x2700c, + .mnd_width = 16, + .hid_width = 5, + .parent_map = gcc_parent_map_0, + .freq_tbl = ftbl_blsp1_uart1_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp2_uart1_apps_clk_src", + .parent_names = gcc_parent_names_0, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp2_uart2_apps_clk_src = { + .cmd_rcgr = 0x2900c, + .mnd_width = 16, + .hid_width = 5, + .parent_map = gcc_parent_map_0, + .freq_tbl = ftbl_blsp1_uart1_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp2_uart2_apps_clk_src", + .parent_names = gcc_parent_names_0, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp2_uart3_apps_clk_src = { + .cmd_rcgr = 0x2b00c, + .mnd_width = 16, + .hid_width = 5, + .parent_map = gcc_parent_map_0, + .freq_tbl = ftbl_blsp1_uart1_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp2_uart3_apps_clk_src", + .parent_names = gcc_parent_names_0, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static const struct freq_tbl ftbl_gp1_clk_src[] = { + F(19200000, P_XO, 1, 0, 0), + F(100000000, P_GPLL0_OUT_MAIN, 6, 0, 0), + F(200000000, P_GPLL0_OUT_MAIN, 3, 0, 0), + { } +}; + +static struct clk_rcg2 gp1_clk_src = { + .cmd_rcgr = 0x64004, + .mnd_width = 8, + .hid_width = 5, + .parent_map = gcc_parent_map_2, + .freq_tbl = ftbl_gp1_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "gp1_clk_src", + .parent_names = gcc_parent_names_2, + .num_parents = 5, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 gp2_clk_src = { + .cmd_rcgr = 0x65004, + .mnd_width = 8, + .hid_width = 5, + .parent_map = gcc_parent_map_2, + .freq_tbl = ftbl_gp1_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = 
"gp2_clk_src", + .parent_names = gcc_parent_names_2, + .num_parents = 5, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 gp3_clk_src = { + .cmd_rcgr = 0x66004, + .mnd_width = 8, + .hid_width = 5, + .parent_map = gcc_parent_map_2, + .freq_tbl = ftbl_gp1_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "gp3_clk_src", + .parent_names = gcc_parent_names_2, + .num_parents = 5, + .ops = &clk_rcg2_ops, + }, +}; + +static const struct freq_tbl ftbl_hmss_ahb_clk_src[] = { + F(19200000, P_XO, 1, 0, 0), + F(37500000, P_GPLL0_OUT_MAIN, 16, 0, 0), + F(75000000, P_GPLL0_OUT_MAIN, 8, 0, 0), + { } +}; + +static struct clk_rcg2 hmss_ahb_clk_src = { + .cmd_rcgr = 0x48014, + .mnd_width = 0, + .hid_width = 5, + .parent_map = gcc_parent_map_1, + .freq_tbl = ftbl_hmss_ahb_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "hmss_ahb_clk_src", + .parent_names = gcc_parent_names_1, + .num_parents = 3, + .ops = &clk_rcg2_ops, + }, +}; + +static const struct freq_tbl ftbl_hmss_rbcpr_clk_src[] = { + F(19200000, P_XO, 1, 0, 0), + { } +}; + +static struct clk_rcg2 hmss_rbcpr_clk_src = { + .cmd_rcgr = 0x48044, + .mnd_width = 0, + .hid_width = 5, + .parent_map = gcc_parent_map_1, + .freq_tbl = ftbl_hmss_rbcpr_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "hmss_rbcpr_clk_src", + .parent_names = gcc_parent_names_1, + .num_parents = 3, + .ops = &clk_rcg2_ops, + }, +}; + +static const struct freq_tbl ftbl_pcie_aux_clk_src[] = { + F(1010526, P_XO, 1, 1, 19), + { } +}; + +static struct clk_rcg2 pcie_aux_clk_src = { + .cmd_rcgr = 0x6c000, + .mnd_width = 16, + .hid_width = 5, + .parent_map = gcc_parent_map_3, + .freq_tbl = ftbl_pcie_aux_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "pcie_aux_clk_src", + .parent_names = gcc_parent_names_3, + .num_parents = 3, + .ops = &clk_rcg2_ops, + }, +}; + +static const struct freq_tbl ftbl_pdm2_clk_src[] = { + F(60000000, P_GPLL0_OUT_MAIN, 10, 0, 0), + { } +}; + +static struct clk_rcg2 pdm2_clk_src = { + .cmd_rcgr = 0x33010, + .mnd_width = 0, + .hid_width = 5, + .parent_map = gcc_parent_map_1, + .freq_tbl = ftbl_pdm2_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "pdm2_clk_src", + .parent_names = gcc_parent_names_1, + .num_parents = 3, + .ops = &clk_rcg2_ops, + }, +}; + +static const struct freq_tbl ftbl_sdcc2_apps_clk_src[] = { + F(144000, P_XO, 16, 3, 25), + F(400000, P_XO, 12, 1, 4), + F(20000000, P_GPLL0_OUT_MAIN, 15, 1, 2), + F(25000000, P_GPLL0_OUT_MAIN, 12, 1, 2), + F(50000000, P_GPLL0_OUT_MAIN, 12, 0, 0), + F(100000000, P_GPLL0_OUT_MAIN, 6, 0, 0), + F(200000000, P_GPLL0_OUT_MAIN, 3, 0, 0), + { } +}; + +static struct clk_rcg2 sdcc2_apps_clk_src = { + .cmd_rcgr = 0x14010, + .mnd_width = 8, + .hid_width = 5, + .parent_map = gcc_parent_map_4, + .freq_tbl = ftbl_sdcc2_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "sdcc2_apps_clk_src", + .parent_names = gcc_parent_names_4, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static const struct freq_tbl ftbl_sdcc4_apps_clk_src[] = { + F(144000, P_XO, 16, 3, 25), + F(400000, P_XO, 12, 1, 4), + F(20000000, P_GPLL0_OUT_MAIN, 15, 1, 2), + F(25000000, P_GPLL0_OUT_MAIN, 12, 1, 2), + F(50000000, P_GPLL0_OUT_MAIN, 12, 0, 0), + F(100000000, P_GPLL0_OUT_MAIN, 6, 0, 0), + { } +}; + +static struct clk_rcg2 sdcc4_apps_clk_src = { + .cmd_rcgr = 0x16010, + .mnd_width = 8, + .hid_width = 5, + .parent_map = gcc_parent_map_1, + .freq_tbl = ftbl_sdcc4_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "sdcc4_apps_clk_src", + .parent_names 
= gcc_parent_names_1, + .num_parents = 3, + .ops = &clk_rcg2_ops, + }, +}; + +static const struct freq_tbl ftbl_tsif_ref_clk_src[] = { + F(105495, P_XO, 1, 1, 182), + { } +}; + +static struct clk_rcg2 tsif_ref_clk_src = { + .cmd_rcgr = 0x36010, + .mnd_width = 8, + .hid_width = 5, + .parent_map = gcc_parent_map_5, + .freq_tbl = ftbl_tsif_ref_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "tsif_ref_clk_src", + .parent_names = gcc_parent_names_5, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static const struct freq_tbl ftbl_ufs_axi_clk_src[] = { + F(100000000, P_GPLL0_OUT_MAIN, 6, 0, 0), + F(200000000, P_GPLL0_OUT_MAIN, 3, 0, 0), + F(240000000, P_GPLL0_OUT_MAIN, 2.5, 0, 0), + { } +}; + +static struct clk_rcg2 ufs_axi_clk_src = { + .cmd_rcgr = 0x75018, + .mnd_width = 8, + .hid_width = 5, + .parent_map = gcc_parent_map_0, + .freq_tbl = ftbl_ufs_axi_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "ufs_axi_clk_src", + .parent_names = gcc_parent_names_0, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static const struct freq_tbl ftbl_usb30_master_clk_src[] = { + F(19200000, P_XO, 1, 0, 0), + F(120000000, P_GPLL0_OUT_MAIN, 5, 0, 0), + F(150000000, P_GPLL0_OUT_MAIN, 4, 0, 0), + { } +}; + +static struct clk_rcg2 usb30_master_clk_src = { + .cmd_rcgr = 0xf014, + .mnd_width = 8, + .hid_width = 5, + .parent_map = gcc_parent_map_0, + .freq_tbl = ftbl_usb30_master_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "usb30_master_clk_src", + .parent_names = gcc_parent_names_0, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 usb30_mock_utmi_clk_src = { + .cmd_rcgr = 0xf028, + .mnd_width = 0, + .hid_width = 5, + .parent_map = gcc_parent_map_0, + .freq_tbl = ftbl_hmss_rbcpr_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "usb30_mock_utmi_clk_src", + .parent_names = gcc_parent_names_0, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static const struct freq_tbl ftbl_usb3_phy_aux_clk_src[] = { + F(1200000, P_XO, 16, 0, 0), + { } +}; + +static struct clk_rcg2 usb3_phy_aux_clk_src = { + .cmd_rcgr = 0x5000c, + .mnd_width = 0, + .hid_width = 5, + .parent_map = gcc_parent_map_3, + .freq_tbl = ftbl_usb3_phy_aux_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "usb3_phy_aux_clk_src", + .parent_names = gcc_parent_names_3, + .num_parents = 3, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_branch gcc_aggre1_noc_xo_clk = { + .halt_reg = 0x8202c, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x8202c, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_aggre1_noc_xo_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_aggre1_ufs_axi_clk = { + .halt_reg = 0x82028, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x82028, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_aggre1_ufs_axi_clk", + .parent_names = (const char *[]){ + "ufs_axi_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_aggre1_usb3_axi_clk = { + .halt_reg = 0x82024, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x82024, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_aggre1_usb3_axi_clk", + .parent_names = (const char *[]){ + "usb30_master_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_apss_qdss_tsctr_div2_clk = { + .halt_reg = 0x48090, + .halt_check = BRANCH_HALT, + .clkr = { 
+ .enable_reg = 0x48090, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_apss_qdss_tsctr_div2_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_apss_qdss_tsctr_div8_clk = { + .halt_reg = 0x48094, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x48094, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_apss_qdss_tsctr_div8_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_bimc_hmss_axi_clk = { + .halt_reg = 0x48004, + .halt_check = BRANCH_HALT_VOTED, + .clkr = { + .enable_reg = 0x52004, + .enable_mask = BIT(22), + .hw.init = &(struct clk_init_data){ + .name = "gcc_bimc_hmss_axi_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_bimc_mss_q6_axi_clk = { + .halt_reg = 0x4401c, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x4401c, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_bimc_mss_q6_axi_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp1_ahb_clk = { + .halt_reg = 0x17004, + .halt_check = BRANCH_HALT_VOTED, + .clkr = { + .enable_reg = 0x52004, + .enable_mask = BIT(17), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp1_ahb_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp1_qup1_i2c_apps_clk = { + .halt_reg = 0x19008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x19008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp1_qup1_i2c_apps_clk", + .parent_names = (const char *[]){ + "blsp1_qup1_i2c_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp1_qup1_spi_apps_clk = { + .halt_reg = 0x19004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x19004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp1_qup1_spi_apps_clk", + .parent_names = (const char *[]){ + "blsp1_qup1_spi_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp1_qup2_i2c_apps_clk = { + .halt_reg = 0x1b008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x1b008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp1_qup2_i2c_apps_clk", + .parent_names = (const char *[]){ + "blsp1_qup2_i2c_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp1_qup2_spi_apps_clk = { + .halt_reg = 0x1b004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x1b004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp1_qup2_spi_apps_clk", + .parent_names = (const char *[]){ + "blsp1_qup2_spi_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp1_qup3_i2c_apps_clk = { + .halt_reg = 0x1d008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x1d008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp1_qup3_i2c_apps_clk", + .parent_names = (const char *[]){ + "blsp1_qup3_i2c_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp1_qup3_spi_apps_clk = { + .halt_reg = 0x1d004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x1d004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp1_qup3_spi_apps_clk", + .parent_names = (const char *[]){ + 
"blsp1_qup3_spi_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp1_qup4_i2c_apps_clk = { + .halt_reg = 0x1f008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x1f008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp1_qup4_i2c_apps_clk", + .parent_names = (const char *[]){ + "blsp1_qup4_i2c_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp1_qup4_spi_apps_clk = { + .halt_reg = 0x1f004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x1f004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp1_qup4_spi_apps_clk", + .parent_names = (const char *[]){ + "blsp1_qup4_spi_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp1_qup5_i2c_apps_clk = { + .halt_reg = 0x21008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x21008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp1_qup5_i2c_apps_clk", + .parent_names = (const char *[]){ + "blsp1_qup5_i2c_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp1_qup5_spi_apps_clk = { + .halt_reg = 0x21004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x21004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp1_qup5_spi_apps_clk", + .parent_names = (const char *[]){ + "blsp1_qup5_spi_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp1_qup6_i2c_apps_clk = { + .halt_reg = 0x23008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x23008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp1_qup6_i2c_apps_clk", + .parent_names = (const char *[]){ + "blsp1_qup6_i2c_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp1_qup6_spi_apps_clk = { + .halt_reg = 0x23004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x23004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp1_qup6_spi_apps_clk", + .parent_names = (const char *[]){ + "blsp1_qup6_spi_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp1_sleep_clk = { + .halt_reg = 0x17008, + .halt_check = BRANCH_HALT_VOTED, + .clkr = { + .enable_reg = 0x52004, + .enable_mask = BIT(16), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp1_sleep_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp1_uart1_apps_clk = { + .halt_reg = 0x1a004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x1a004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp1_uart1_apps_clk", + .parent_names = (const char *[]){ + "blsp1_uart1_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp1_uart2_apps_clk = { + .halt_reg = 0x1c004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x1c004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp1_uart2_apps_clk", + .parent_names = (const char *[]){ + "blsp1_uart2_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp1_uart3_apps_clk = { + .halt_reg = 0x1e004, + 
.halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x1e004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp1_uart3_apps_clk", + .parent_names = (const char *[]){ + "blsp1_uart3_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp2_ahb_clk = { + .halt_reg = 0x25004, + .halt_check = BRANCH_HALT_VOTED, + .clkr = { + .enable_reg = 0x52004, + .enable_mask = BIT(15), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp2_ahb_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp2_qup1_i2c_apps_clk = { + .halt_reg = 0x26008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x26008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp2_qup1_i2c_apps_clk", + .parent_names = (const char *[]){ + "blsp2_qup1_i2c_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp2_qup1_spi_apps_clk = { + .halt_reg = 0x26004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x26004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp2_qup1_spi_apps_clk", + .parent_names = (const char *[]){ + "blsp2_qup1_spi_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp2_qup2_i2c_apps_clk = { + .halt_reg = 0x28008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x28008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp2_qup2_i2c_apps_clk", + .parent_names = (const char *[]){ + "blsp2_qup2_i2c_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp2_qup2_spi_apps_clk = { + .halt_reg = 0x28004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x28004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp2_qup2_spi_apps_clk", + .parent_names = (const char *[]){ + "blsp2_qup2_spi_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp2_qup3_i2c_apps_clk = { + .halt_reg = 0x2a008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x2a008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp2_qup3_i2c_apps_clk", + .parent_names = (const char *[]){ + "blsp2_qup3_i2c_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp2_qup3_spi_apps_clk = { + .halt_reg = 0x2a004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x2a004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp2_qup3_spi_apps_clk", + .parent_names = (const char *[]){ + "blsp2_qup3_spi_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp2_qup4_i2c_apps_clk = { + .halt_reg = 0x2c008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x2c008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp2_qup4_i2c_apps_clk", + .parent_names = (const char *[]){ + "blsp2_qup4_i2c_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp2_qup4_spi_apps_clk = { + .halt_reg = 0x2c004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x2c004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp2_qup4_spi_apps_clk", + 
.parent_names = (const char *[]){ + "blsp2_qup4_spi_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp2_qup5_i2c_apps_clk = { + .halt_reg = 0x2e008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x2e008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp2_qup5_i2c_apps_clk", + .parent_names = (const char *[]){ + "blsp2_qup5_i2c_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp2_qup5_spi_apps_clk = { + .halt_reg = 0x2e004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x2e004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp2_qup5_spi_apps_clk", + .parent_names = (const char *[]){ + "blsp2_qup5_spi_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp2_qup6_i2c_apps_clk = { + .halt_reg = 0x30008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x30008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp2_qup6_i2c_apps_clk", + .parent_names = (const char *[]){ + "blsp2_qup6_i2c_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp2_qup6_spi_apps_clk = { + .halt_reg = 0x30004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x30004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp2_qup6_spi_apps_clk", + .parent_names = (const char *[]){ + "blsp2_qup6_spi_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp2_sleep_clk = { + .halt_reg = 0x25008, + .halt_check = BRANCH_HALT_VOTED, + .clkr = { + .enable_reg = 0x52004, + .enable_mask = BIT(14), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp2_sleep_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp2_uart1_apps_clk = { + .halt_reg = 0x27004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x27004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp2_uart1_apps_clk", + .parent_names = (const char *[]){ + "blsp2_uart1_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp2_uart2_apps_clk = { + .halt_reg = 0x29004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x29004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp2_uart2_apps_clk", + .parent_names = (const char *[]){ + "blsp2_uart2_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp2_uart3_apps_clk = { + .halt_reg = 0x2b004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x2b004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp2_uart3_apps_clk", + .parent_names = (const char *[]){ + "blsp2_uart3_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_cfg_noc_usb3_axi_clk = { + .halt_reg = 0x5018, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x5018, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_cfg_noc_usb3_axi_clk", + .parent_names = (const char *[]){ + "usb30_master_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_gp1_clk = { + .halt_reg = 0x64000, + 
.halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x64000, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_gp1_clk", + .parent_names = (const char *[]){ + "gp1_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_gp2_clk = { + .halt_reg = 0x65000, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x65000, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_gp2_clk", + .parent_names = (const char *[]){ + "gp2_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_gp3_clk = { + .halt_reg = 0x66000, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x66000, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_gp3_clk", + .parent_names = (const char *[]){ + "gp3_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_gpu_bimc_gfx_clk = { + .halt_reg = 0x71010, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x71010, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_gpu_bimc_gfx_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_gpu_bimc_gfx_src_clk = { + .halt_reg = 0x7100c, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x7100c, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_gpu_bimc_gfx_src_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_gpu_cfg_ahb_clk = { + .halt_reg = 0x71004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x71004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_gpu_cfg_ahb_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_gpu_snoc_dvm_gfx_clk = { + .halt_reg = 0x71018, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x71018, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_gpu_snoc_dvm_gfx_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_hmss_ahb_clk = { + .halt_reg = 0x48000, + .halt_check = BRANCH_HALT_VOTED, + .clkr = { + .enable_reg = 0x52004, + .enable_mask = BIT(21), + .hw.init = &(struct clk_init_data){ + .name = "gcc_hmss_ahb_clk", + .parent_names = (const char *[]){ + "hmss_ahb_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_hmss_at_clk = { + .halt_reg = 0x48010, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x48010, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_hmss_at_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_hmss_dvm_bus_clk = { + .halt_reg = 0x4808c, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x4808c, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_hmss_dvm_bus_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_hmss_rbcpr_clk = { + .halt_reg = 0x48008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x48008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_hmss_rbcpr_clk", + .parent_names = (const char *[]){ + "hmss_rbcpr_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_hmss_trig_clk = { + .halt_reg = 0x4800c, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x4800c, + .enable_mask = BIT(0), + .hw.init = &(struct 
clk_init_data){ + .name = "gcc_hmss_trig_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_lpass_at_clk = { + .halt_reg = 0x47020, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x47020, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_lpass_at_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_lpass_trig_clk = { + .halt_reg = 0x4701c, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x4701c, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_lpass_trig_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_mmss_noc_cfg_ahb_clk = { + .halt_reg = 0x9004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x9004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_mmss_noc_cfg_ahb_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_mmss_qm_ahb_clk = { + .halt_reg = 0x9030, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x9030, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_mmss_qm_ahb_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_mmss_qm_core_clk = { + .halt_reg = 0x900c, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x900c, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_mmss_qm_core_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_mmss_sys_noc_axi_clk = { + .halt_reg = 0x9000, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x9000, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_mmss_sys_noc_axi_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_mss_at_clk = { + .halt_reg = 0x8a00c, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x8a00c, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_mss_at_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_pcie_0_aux_clk = { + .halt_reg = 0x6b014, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x6b014, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_pcie_0_aux_clk", + .parent_names = (const char *[]){ + "pcie_aux_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_pcie_0_cfg_ahb_clk = { + .halt_reg = 0x6b010, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x6b010, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_pcie_0_cfg_ahb_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_pcie_0_mstr_axi_clk = { + .halt_reg = 0x6b00c, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x6b00c, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_pcie_0_mstr_axi_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_pcie_0_pipe_clk = { + .halt_reg = 0x6b018, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x6b018, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_pcie_0_pipe_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_pcie_0_slv_axi_clk = { + .halt_reg = 0x6b008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x6b008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_pcie_0_slv_axi_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch 
gcc_pcie_phy_aux_clk = { + .halt_reg = 0x6f004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x6f004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_pcie_phy_aux_clk", + .parent_names = (const char *[]){ + "pcie_aux_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_pdm2_clk = { + .halt_reg = 0x3300c, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x3300c, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_pdm2_clk", + .parent_names = (const char *[]){ + "pdm2_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_pdm_ahb_clk = { + .halt_reg = 0x33004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x33004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_pdm_ahb_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_pdm_xo4_clk = { + .halt_reg = 0x33008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x33008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_pdm_xo4_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_prng_ahb_clk = { + .halt_reg = 0x34004, + .halt_check = BRANCH_HALT_VOTED, + .clkr = { + .enable_reg = 0x52004, + .enable_mask = BIT(13), + .hw.init = &(struct clk_init_data){ + .name = "gcc_prng_ahb_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_sdcc2_ahb_clk = { + .halt_reg = 0x14008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x14008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_sdcc2_ahb_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_sdcc2_apps_clk = { + .halt_reg = 0x14004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x14004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_sdcc2_apps_clk", + .parent_names = (const char *[]){ + "sdcc2_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_sdcc4_ahb_clk = { + .halt_reg = 0x16008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x16008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_sdcc4_ahb_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_sdcc4_apps_clk = { + .halt_reg = 0x16004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x16004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_sdcc4_apps_clk", + .parent_names = (const char *[]){ + "sdcc4_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_tsif_ahb_clk = { + .halt_reg = 0x36004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x36004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_tsif_ahb_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_tsif_inactivity_timers_clk = { + .halt_reg = 0x3600c, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x3600c, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_tsif_inactivity_timers_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_tsif_ref_clk = { + .halt_reg = 0x36008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x36008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = 
"gcc_tsif_ref_clk", + .parent_names = (const char *[]){ + "tsif_ref_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_ufs_ahb_clk = { + .halt_reg = 0x7500c, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x7500c, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_ufs_ahb_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_ufs_axi_clk = { + .halt_reg = 0x75008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x75008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_ufs_axi_clk", + .parent_names = (const char *[]){ + "ufs_axi_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_ufs_ice_core_clk = { + .halt_reg = 0x7600c, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x7600c, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_ufs_ice_core_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_ufs_phy_aux_clk = { + .halt_reg = 0x76040, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x76040, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_ufs_phy_aux_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_ufs_rx_symbol_0_clk = { + .halt_reg = 0x75014, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x75014, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_ufs_rx_symbol_0_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_ufs_rx_symbol_1_clk = { + .halt_reg = 0x7605c, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x7605c, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_ufs_rx_symbol_1_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_ufs_tx_symbol_0_clk = { + .halt_reg = 0x75010, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x75010, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_ufs_tx_symbol_0_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_ufs_unipro_core_clk = { + .halt_reg = 0x76008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x76008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_ufs_unipro_core_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_usb30_master_clk = { + .halt_reg = 0xf008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0xf008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_usb30_master_clk", + .parent_names = (const char *[]){ + "usb30_master_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_usb30_mock_utmi_clk = { + .halt_reg = 0xf010, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0xf010, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_usb30_mock_utmi_clk", + .parent_names = (const char *[]){ + "usb30_mock_utmi_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_usb30_sleep_clk = { + .halt_reg = 0xf00c, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0xf00c, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_usb30_sleep_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_usb3_phy_aux_clk = { + .halt_reg = 
0x50000, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x50000, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_usb3_phy_aux_clk", + .parent_names = (const char *[]){ + "usb3_phy_aux_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_usb3_phy_pipe_clk = { + .halt_reg = 0x50004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x50004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_usb3_phy_pipe_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_usb_phy_cfg_ahb2phy_clk = { + .halt_reg = 0x6a004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x6a004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_usb_phy_cfg_ahb2phy_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct gdsc pcie_0_gdsc = { + .gdscr = 0x6b004, + .gds_hw_ctrl = 0x0, + .pd = { + .name = "pcie_0_gdsc", + }, + .pwrsts = PWRSTS_OFF_ON, + .flags = VOTABLE, +}; + +static struct gdsc ufs_gdsc = { + .gdscr = 0x75004, + .gds_hw_ctrl = 0x0, + .pd = { + .name = "ufs_gdsc", + }, + .pwrsts = PWRSTS_OFF_ON, + .flags = VOTABLE, +}; + +static struct gdsc usb_30_gdsc = { + .gdscr = 0xf004, + .gds_hw_ctrl = 0x0, + .pd = { + .name = "usb_30_gdsc", + }, + .pwrsts = PWRSTS_OFF_ON, + .flags = VOTABLE, +}; + +static struct clk_regmap *gcc_msm8998_clocks[] = { + [BLSP1_QUP1_I2C_APPS_CLK_SRC] = &blsp1_qup1_i2c_apps_clk_src.clkr, + [BLSP1_QUP1_SPI_APPS_CLK_SRC] = &blsp1_qup1_spi_apps_clk_src.clkr, + [BLSP1_QUP2_I2C_APPS_CLK_SRC] = &blsp1_qup2_i2c_apps_clk_src.clkr, + [BLSP1_QUP2_SPI_APPS_CLK_SRC] = &blsp1_qup2_spi_apps_clk_src.clkr, + [BLSP1_QUP3_I2C_APPS_CLK_SRC] = &blsp1_qup3_i2c_apps_clk_src.clkr, + [BLSP1_QUP3_SPI_APPS_CLK_SRC] = &blsp1_qup3_spi_apps_clk_src.clkr, + [BLSP1_QUP4_I2C_APPS_CLK_SRC] = &blsp1_qup4_i2c_apps_clk_src.clkr, + [BLSP1_QUP4_SPI_APPS_CLK_SRC] = &blsp1_qup4_spi_apps_clk_src.clkr, + [BLSP1_QUP5_I2C_APPS_CLK_SRC] = &blsp1_qup5_i2c_apps_clk_src.clkr, + [BLSP1_QUP5_SPI_APPS_CLK_SRC] = &blsp1_qup5_spi_apps_clk_src.clkr, + [BLSP1_QUP6_I2C_APPS_CLK_SRC] = &blsp1_qup6_i2c_apps_clk_src.clkr, + [BLSP1_QUP6_SPI_APPS_CLK_SRC] = &blsp1_qup6_spi_apps_clk_src.clkr, + [BLSP1_UART1_APPS_CLK_SRC] = &blsp1_uart1_apps_clk_src.clkr, + [BLSP1_UART2_APPS_CLK_SRC] = &blsp1_uart2_apps_clk_src.clkr, + [BLSP1_UART3_APPS_CLK_SRC] = &blsp1_uart3_apps_clk_src.clkr, + [BLSP2_QUP1_I2C_APPS_CLK_SRC] = &blsp2_qup1_i2c_apps_clk_src.clkr, + [BLSP2_QUP1_SPI_APPS_CLK_SRC] = &blsp2_qup1_spi_apps_clk_src.clkr, + [BLSP2_QUP2_I2C_APPS_CLK_SRC] = &blsp2_qup2_i2c_apps_clk_src.clkr, + [BLSP2_QUP2_SPI_APPS_CLK_SRC] = &blsp2_qup2_spi_apps_clk_src.clkr, + [BLSP2_QUP3_I2C_APPS_CLK_SRC] = &blsp2_qup3_i2c_apps_clk_src.clkr, + [BLSP2_QUP3_SPI_APPS_CLK_SRC] = &blsp2_qup3_spi_apps_clk_src.clkr, + [BLSP2_QUP4_I2C_APPS_CLK_SRC] = &blsp2_qup4_i2c_apps_clk_src.clkr, + [BLSP2_QUP4_SPI_APPS_CLK_SRC] = &blsp2_qup4_spi_apps_clk_src.clkr, + [BLSP2_QUP5_I2C_APPS_CLK_SRC] = &blsp2_qup5_i2c_apps_clk_src.clkr, + [BLSP2_QUP5_SPI_APPS_CLK_SRC] = &blsp2_qup5_spi_apps_clk_src.clkr, + [BLSP2_QUP6_I2C_APPS_CLK_SRC] = &blsp2_qup6_i2c_apps_clk_src.clkr, + [BLSP2_QUP6_SPI_APPS_CLK_SRC] = &blsp2_qup6_spi_apps_clk_src.clkr, + [BLSP2_UART1_APPS_CLK_SRC] = &blsp2_uart1_apps_clk_src.clkr, + [BLSP2_UART2_APPS_CLK_SRC] = &blsp2_uart2_apps_clk_src.clkr, + [BLSP2_UART3_APPS_CLK_SRC] = &blsp2_uart3_apps_clk_src.clkr, + [GCC_AGGRE1_NOC_XO_CLK] = &gcc_aggre1_noc_xo_clk.clkr, + [GCC_AGGRE1_UFS_AXI_CLK] 
= &gcc_aggre1_ufs_axi_clk.clkr, + [GCC_AGGRE1_USB3_AXI_CLK] = &gcc_aggre1_usb3_axi_clk.clkr, + [GCC_APSS_QDSS_TSCTR_DIV2_CLK] = &gcc_apss_qdss_tsctr_div2_clk.clkr, + [GCC_APSS_QDSS_TSCTR_DIV8_CLK] = &gcc_apss_qdss_tsctr_div8_clk.clkr, + [GCC_BIMC_HMSS_AXI_CLK] = &gcc_bimc_hmss_axi_clk.clkr, + [GCC_BIMC_MSS_Q6_AXI_CLK] = &gcc_bimc_mss_q6_axi_clk.clkr, + [GCC_BLSP1_AHB_CLK] = &gcc_blsp1_ahb_clk.clkr, + [GCC_BLSP1_QUP1_I2C_APPS_CLK] = &gcc_blsp1_qup1_i2c_apps_clk.clkr, + [GCC_BLSP1_QUP1_SPI_APPS_CLK] = &gcc_blsp1_qup1_spi_apps_clk.clkr, + [GCC_BLSP1_QUP2_I2C_APPS_CLK] = &gcc_blsp1_qup2_i2c_apps_clk.clkr, + [GCC_BLSP1_QUP2_SPI_APPS_CLK] = &gcc_blsp1_qup2_spi_apps_clk.clkr, + [GCC_BLSP1_QUP3_I2C_APPS_CLK] = &gcc_blsp1_qup3_i2c_apps_clk.clkr, + [GCC_BLSP1_QUP3_SPI_APPS_CLK] = &gcc_blsp1_qup3_spi_apps_clk.clkr, + [GCC_BLSP1_QUP4_I2C_APPS_CLK] = &gcc_blsp1_qup4_i2c_apps_clk.clkr, + [GCC_BLSP1_QUP4_SPI_APPS_CLK] = &gcc_blsp1_qup4_spi_apps_clk.clkr, + [GCC_BLSP1_QUP5_I2C_APPS_CLK] = &gcc_blsp1_qup5_i2c_apps_clk.clkr, + [GCC_BLSP1_QUP5_SPI_APPS_CLK] = &gcc_blsp1_qup5_spi_apps_clk.clkr, + [GCC_BLSP1_QUP6_I2C_APPS_CLK] = &gcc_blsp1_qup6_i2c_apps_clk.clkr, + [GCC_BLSP1_QUP6_SPI_APPS_CLK] = &gcc_blsp1_qup6_spi_apps_clk.clkr, + [GCC_BLSP1_SLEEP_CLK] = &gcc_blsp1_sleep_clk.clkr, + [GCC_BLSP1_UART1_APPS_CLK] = &gcc_blsp1_uart1_apps_clk.clkr, + [GCC_BLSP1_UART2_APPS_CLK] = &gcc_blsp1_uart2_apps_clk.clkr, + [GCC_BLSP1_UART3_APPS_CLK] = &gcc_blsp1_uart3_apps_clk.clkr, + [GCC_BLSP2_AHB_CLK] = &gcc_blsp2_ahb_clk.clkr, + [GCC_BLSP2_QUP1_I2C_APPS_CLK] = &gcc_blsp2_qup1_i2c_apps_clk.clkr, + [GCC_BLSP2_QUP1_SPI_APPS_CLK] = &gcc_blsp2_qup1_spi_apps_clk.clkr, + [GCC_BLSP2_QUP2_I2C_APPS_CLK] = &gcc_blsp2_qup2_i2c_apps_clk.clkr, + [GCC_BLSP2_QUP2_SPI_APPS_CLK] = &gcc_blsp2_qup2_spi_apps_clk.clkr, + [GCC_BLSP2_QUP3_I2C_APPS_CLK] = &gcc_blsp2_qup3_i2c_apps_clk.clkr, + [GCC_BLSP2_QUP3_SPI_APPS_CLK] = &gcc_blsp2_qup3_spi_apps_clk.clkr, + [GCC_BLSP2_QUP4_I2C_APPS_CLK] = &gcc_blsp2_qup4_i2c_apps_clk.clkr, + [GCC_BLSP2_QUP4_SPI_APPS_CLK] = &gcc_blsp2_qup4_spi_apps_clk.clkr, + [GCC_BLSP2_QUP5_I2C_APPS_CLK] = &gcc_blsp2_qup5_i2c_apps_clk.clkr, + [GCC_BLSP2_QUP5_SPI_APPS_CLK] = &gcc_blsp2_qup5_spi_apps_clk.clkr, + [GCC_BLSP2_QUP6_I2C_APPS_CLK] = &gcc_blsp2_qup6_i2c_apps_clk.clkr, + [GCC_BLSP2_QUP6_SPI_APPS_CLK] = &gcc_blsp2_qup6_spi_apps_clk.clkr, + [GCC_BLSP2_SLEEP_CLK] = &gcc_blsp2_sleep_clk.clkr, + [GCC_BLSP2_UART1_APPS_CLK] = &gcc_blsp2_uart1_apps_clk.clkr, + [GCC_BLSP2_UART2_APPS_CLK] = &gcc_blsp2_uart2_apps_clk.clkr, + [GCC_BLSP2_UART3_APPS_CLK] = &gcc_blsp2_uart3_apps_clk.clkr, + [GCC_CFG_NOC_USB3_AXI_CLK] = &gcc_cfg_noc_usb3_axi_clk.clkr, + [GCC_GP1_CLK] = &gcc_gp1_clk.clkr, + [GCC_GP2_CLK] = &gcc_gp2_clk.clkr, + [GCC_GP3_CLK] = &gcc_gp3_clk.clkr, + [GCC_GPU_BIMC_GFX_CLK] = &gcc_gpu_bimc_gfx_clk.clkr, + [GCC_GPU_BIMC_GFX_SRC_CLK] = &gcc_gpu_bimc_gfx_src_clk.clkr, + [GCC_GPU_CFG_AHB_CLK] = &gcc_gpu_cfg_ahb_clk.clkr, + [GCC_GPU_SNOC_DVM_GFX_CLK] = &gcc_gpu_snoc_dvm_gfx_clk.clkr, + [GCC_HMSS_AHB_CLK] = &gcc_hmss_ahb_clk.clkr, + [GCC_HMSS_AT_CLK] = &gcc_hmss_at_clk.clkr, + [GCC_HMSS_DVM_BUS_CLK] = &gcc_hmss_dvm_bus_clk.clkr, + [GCC_HMSS_RBCPR_CLK] = &gcc_hmss_rbcpr_clk.clkr, + [GCC_HMSS_TRIG_CLK] = &gcc_hmss_trig_clk.clkr, + [GCC_LPASS_AT_CLK] = &gcc_lpass_at_clk.clkr, + [GCC_LPASS_TRIG_CLK] = &gcc_lpass_trig_clk.clkr, + [GCC_MMSS_NOC_CFG_AHB_CLK] = &gcc_mmss_noc_cfg_ahb_clk.clkr, + [GCC_MMSS_QM_AHB_CLK] = &gcc_mmss_qm_ahb_clk.clkr, + [GCC_MMSS_QM_CORE_CLK] = &gcc_mmss_qm_core_clk.clkr, + [GCC_MMSS_SYS_NOC_AXI_CLK] = 
&gcc_mmss_sys_noc_axi_clk.clkr, + [GCC_MSS_AT_CLK] = &gcc_mss_at_clk.clkr, + [GCC_PCIE_0_AUX_CLK] = &gcc_pcie_0_aux_clk.clkr, + [GCC_PCIE_0_CFG_AHB_CLK] = &gcc_pcie_0_cfg_ahb_clk.clkr, + [GCC_PCIE_0_MSTR_AXI_CLK] = &gcc_pcie_0_mstr_axi_clk.clkr, + [GCC_PCIE_0_PIPE_CLK] = &gcc_pcie_0_pipe_clk.clkr, + [GCC_PCIE_0_SLV_AXI_CLK] = &gcc_pcie_0_slv_axi_clk.clkr, + [GCC_PCIE_PHY_AUX_CLK] = &gcc_pcie_phy_aux_clk.clkr, + [GCC_PDM2_CLK] = &gcc_pdm2_clk.clkr, + [GCC_PDM_AHB_CLK] = &gcc_pdm_ahb_clk.clkr, + [GCC_PDM_XO4_CLK] = &gcc_pdm_xo4_clk.clkr, + [GCC_PRNG_AHB_CLK] = &gcc_prng_ahb_clk.clkr, + [GCC_SDCC2_AHB_CLK] = &gcc_sdcc2_ahb_clk.clkr, + [GCC_SDCC2_APPS_CLK] = &gcc_sdcc2_apps_clk.clkr, + [GCC_SDCC4_AHB_CLK] = &gcc_sdcc4_ahb_clk.clkr, + [GCC_SDCC4_APPS_CLK] = &gcc_sdcc4_apps_clk.clkr, + [GCC_TSIF_AHB_CLK] = &gcc_tsif_ahb_clk.clkr, + [GCC_TSIF_INACTIVITY_TIMERS_CLK] = &gcc_tsif_inactivity_timers_clk.clkr, + [GCC_TSIF_REF_CLK] = &gcc_tsif_ref_clk.clkr, + [GCC_UFS_AHB_CLK] = &gcc_ufs_ahb_clk.clkr, + [GCC_UFS_AXI_CLK] = &gcc_ufs_axi_clk.clkr, + [GCC_UFS_ICE_CORE_CLK] = &gcc_ufs_ice_core_clk.clkr, + [GCC_UFS_PHY_AUX_CLK] = &gcc_ufs_phy_aux_clk.clkr, + [GCC_UFS_RX_SYMBOL_0_CLK] = &gcc_ufs_rx_symbol_0_clk.clkr, + [GCC_UFS_RX_SYMBOL_1_CLK] = &gcc_ufs_rx_symbol_1_clk.clkr, + [GCC_UFS_TX_SYMBOL_0_CLK] = &gcc_ufs_tx_symbol_0_clk.clkr, + [GCC_UFS_UNIPRO_CORE_CLK] = &gcc_ufs_unipro_core_clk.clkr, + [GCC_USB30_MASTER_CLK] = &gcc_usb30_master_clk.clkr, + [GCC_USB30_MOCK_UTMI_CLK] = &gcc_usb30_mock_utmi_clk.clkr, + [GCC_USB30_SLEEP_CLK] = &gcc_usb30_sleep_clk.clkr, + [GCC_USB3_PHY_AUX_CLK] = &gcc_usb3_phy_aux_clk.clkr, + [GCC_USB3_PHY_PIPE_CLK] = &gcc_usb3_phy_pipe_clk.clkr, + [GCC_USB_PHY_CFG_AHB2PHY_CLK] = &gcc_usb_phy_cfg_ahb2phy_clk.clkr, + [GP1_CLK_SRC] = &gp1_clk_src.clkr, + [GP2_CLK_SRC] = &gp2_clk_src.clkr, + [GP3_CLK_SRC] = &gp3_clk_src.clkr, + [GPLL0] = &gpll0.clkr, + [GPLL0_OUT_EVEN] = &gpll0_out_even.clkr, + [GPLL0_OUT_MAIN] = &gpll0_out_main.clkr, + [GPLL0_OUT_ODD] = &gpll0_out_odd.clkr, + [GPLL0_OUT_TEST] = &gpll0_out_test.clkr, + [GPLL1] = &gpll1.clkr, + [GPLL1_OUT_EVEN] = &gpll1_out_even.clkr, + [GPLL1_OUT_MAIN] = &gpll1_out_main.clkr, + [GPLL1_OUT_ODD] = &gpll1_out_odd.clkr, + [GPLL1_OUT_TEST] = &gpll1_out_test.clkr, + [GPLL2] = &gpll2.clkr, + [GPLL2_OUT_EVEN] = &gpll2_out_even.clkr, + [GPLL2_OUT_MAIN] = &gpll2_out_main.clkr, + [GPLL2_OUT_ODD] = &gpll2_out_odd.clkr, + [GPLL2_OUT_TEST] = &gpll2_out_test.clkr, + [GPLL3] = &gpll3.clkr, + [GPLL3_OUT_EVEN] = &gpll3_out_even.clkr, + [GPLL3_OUT_MAIN] = &gpll3_out_main.clkr, + [GPLL3_OUT_ODD] = &gpll3_out_odd.clkr, + [GPLL3_OUT_TEST] = &gpll3_out_test.clkr, + [GPLL4] = &gpll4.clkr, + [GPLL4_OUT_EVEN] = &gpll4_out_even.clkr, + [GPLL4_OUT_MAIN] = &gpll4_out_main.clkr, + [GPLL4_OUT_ODD] = &gpll4_out_odd.clkr, + [GPLL4_OUT_TEST] = &gpll4_out_test.clkr, + [HMSS_AHB_CLK_SRC] = &hmss_ahb_clk_src.clkr, + [HMSS_RBCPR_CLK_SRC] = &hmss_rbcpr_clk_src.clkr, + [PCIE_AUX_CLK_SRC] = &pcie_aux_clk_src.clkr, + [PDM2_CLK_SRC] = &pdm2_clk_src.clkr, + [SDCC2_APPS_CLK_SRC] = &sdcc2_apps_clk_src.clkr, + [SDCC4_APPS_CLK_SRC] = &sdcc4_apps_clk_src.clkr, + [TSIF_REF_CLK_SRC] = &tsif_ref_clk_src.clkr, + [UFS_AXI_CLK_SRC] = &ufs_axi_clk_src.clkr, + [USB30_MASTER_CLK_SRC] = &usb30_master_clk_src.clkr, + [USB30_MOCK_UTMI_CLK_SRC] = &usb30_mock_utmi_clk_src.clkr, + [USB3_PHY_AUX_CLK_SRC] = &usb3_phy_aux_clk_src.clkr, +}; + +static struct gdsc *gcc_msm8998_gdscs[] = { + [PCIE_0_GDSC] = &pcie_0_gdsc, + [UFS_GDSC] = &ufs_gdsc, + [USB_30_GDSC] = &usb_30_gdsc, +}; + +static const struct 
qcom_reset_map gcc_msm8998_resets[] = { + [GCC_BLSP1_QUP1_BCR] = { 0x19000 }, + [GCC_BLSP1_QUP2_BCR] = { 0x1b000 }, + [GCC_BLSP1_QUP3_BCR] = { 0x1d000 }, + [GCC_BLSP1_QUP4_BCR] = { 0x1f000 }, + [GCC_BLSP1_QUP5_BCR] = { 0x21000 }, + [GCC_BLSP1_QUP6_BCR] = { 0x23000 }, + [GCC_BLSP2_QUP1_BCR] = { 0x26000 }, + [GCC_BLSP2_QUP2_BCR] = { 0x28000 }, + [GCC_BLSP2_QUP3_BCR] = { 0x2a000 }, + [GCC_BLSP2_QUP4_BCR] = { 0x2c000 }, + [GCC_BLSP2_QUP5_BCR] = { 0x2e000 }, + [GCC_BLSP2_QUP6_BCR] = { 0x30000 }, + [GCC_PCIE_0_BCR] = { 0x6b000 }, + [GCC_PDM_BCR] = { 0x33000 }, + [GCC_SDCC2_BCR] = { 0x14000 }, + [GCC_SDCC4_BCR] = { 0x16000 }, + [GCC_TSIF_BCR] = { 0x36000 }, + [GCC_UFS_BCR] = { 0x75000 }, + [GCC_USB_30_BCR] = { 0xf000 }, +}; + +static const struct regmap_config gcc_msm8998_regmap_config = { + .reg_bits = 32, + .reg_stride = 4, + .val_bits = 32, + .max_register = 0x8f000, + .fast_io = true, +}; + +static const struct qcom_cc_desc gcc_msm8998_desc = { + .config = &gcc_msm8998_regmap_config, + .clks = gcc_msm8998_clocks, + .num_clks = ARRAY_SIZE(gcc_msm8998_clocks), + .resets = gcc_msm8998_resets, + .num_resets = ARRAY_SIZE(gcc_msm8998_resets), + .gdscs = gcc_msm8998_gdscs, + .num_gdscs = ARRAY_SIZE(gcc_msm8998_gdscs), +}; + +static int gcc_msm8998_probe(struct platform_device *pdev) +{ + struct regmap *regmap; + int ret; + + regmap = qcom_cc_map(pdev, &gcc_msm8998_desc); + if (IS_ERR(regmap)) + return PTR_ERR(regmap); + + /* + * Set the HMSS_AHB_CLK_SLEEP_ENA bit to allow the hmss_ahb_clk to be + * turned off by hardware during certain apps low power modes. + */ + ret = regmap_update_bits(regmap, 0x52008, BIT(21), BIT(21)); + if (ret) + return ret; + + return qcom_cc_really_probe(pdev, &gcc_msm8998_desc, regmap); +} + +static const struct of_device_id gcc_msm8998_match_table[] = { + { .compatible = "qcom,gcc-msm8998" }, + { } +}; +MODULE_DEVICE_TABLE(of, gcc_msm8998_match_table); + +static struct platform_driver gcc_msm8998_driver = { + .probe = gcc_msm8998_probe, + .driver = { + .name = "gcc-msm8998", + .of_match_table = gcc_msm8998_match_table, + }, +}; + +static int __init gcc_msm8998_init(void) +{ + return platform_driver_register(&gcc_msm8998_driver); +} +core_initcall(gcc_msm8998_init); + +static void __exit gcc_msm8998_exit(void) +{ + platform_driver_unregister(&gcc_msm8998_driver); +} +module_exit(gcc_msm8998_exit); + +MODULE_DESCRIPTION("QCOM GCC msm8998 Driver"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("platform:gcc-msm8998"); diff --git a/include/dt-bindings/clock/qcom,gcc-msm8998.h b/include/dt-bindings/clock/qcom,gcc-msm8998.h new file mode 100644 index 000000000000..58a242e656b1 --- /dev/null +++ b/include/dt-bindings/clock/qcom,gcc-msm8998.h @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2016, The Linux Foundation. All rights reserved. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details.
+ */ + +#ifndef _DT_BINDINGS_CLK_MSM_GCC_COBALT_H +#define _DT_BINDINGS_CLK_MSM_GCC_COBALT_H + +#define BLSP1_QUP1_I2C_APPS_CLK_SRC 0 +#define BLSP1_QUP1_SPI_APPS_CLK_SRC 1 +#define BLSP1_QUP2_I2C_APPS_CLK_SRC 2 +#define BLSP1_QUP2_SPI_APPS_CLK_SRC 3 +#define BLSP1_QUP3_I2C_APPS_CLK_SRC 4 +#define BLSP1_QUP3_SPI_APPS_CLK_SRC 5 +#define BLSP1_QUP4_I2C_APPS_CLK_SRC 6 +#define BLSP1_QUP4_SPI_APPS_CLK_SRC 7 +#define BLSP1_QUP5_I2C_APPS_CLK_SRC 8 +#define BLSP1_QUP5_SPI_APPS_CLK_SRC 9 +#define BLSP1_QUP6_I2C_APPS_CLK_SRC 10 +#define BLSP1_QUP6_SPI_APPS_CLK_SRC 11 +#define BLSP1_UART1_APPS_CLK_SRC 12 +#define BLSP1_UART2_APPS_CLK_SRC 13 +#define BLSP1_UART3_APPS_CLK_SRC 14 +#define BLSP2_QUP1_I2C_APPS_CLK_SRC 15 +#define BLSP2_QUP1_SPI_APPS_CLK_SRC 16 +#define BLSP2_QUP2_I2C_APPS_CLK_SRC 17 +#define BLSP2_QUP2_SPI_APPS_CLK_SRC 18 +#define BLSP2_QUP3_I2C_APPS_CLK_SRC 19 +#define BLSP2_QUP3_SPI_APPS_CLK_SRC 20 +#define BLSP2_QUP4_I2C_APPS_CLK_SRC 21 +#define BLSP2_QUP4_SPI_APPS_CLK_SRC 22 +#define BLSP2_QUP5_I2C_APPS_CLK_SRC 23 +#define BLSP2_QUP5_SPI_APPS_CLK_SRC 24 +#define BLSP2_QUP6_I2C_APPS_CLK_SRC 25 +#define BLSP2_QUP6_SPI_APPS_CLK_SRC 26 +#define BLSP2_UART1_APPS_CLK_SRC 27 +#define BLSP2_UART2_APPS_CLK_SRC 28 +#define BLSP2_UART3_APPS_CLK_SRC 29 +#define GCC_AGGRE1_NOC_XO_CLK 30 +#define GCC_AGGRE1_UFS_AXI_CLK 31 +#define GCC_AGGRE1_USB3_AXI_CLK 32 +#define GCC_APSS_QDSS_TSCTR_DIV2_CLK 33 +#define GCC_APSS_QDSS_TSCTR_DIV8_CLK 34 +#define GCC_BIMC_HMSS_AXI_CLK 35 +#define GCC_BIMC_MSS_Q6_AXI_CLK 36 +#define GCC_BLSP1_AHB_CLK 37 +#define GCC_BLSP1_QUP1_I2C_APPS_CLK 38 +#define GCC_BLSP1_QUP1_SPI_APPS_CLK 39 +#define GCC_BLSP1_QUP2_I2C_APPS_CLK 40 +#define GCC_BLSP1_QUP2_SPI_APPS_CLK 41 +#define GCC_BLSP1_QUP3_I2C_APPS_CLK 42 +#define GCC_BLSP1_QUP3_SPI_APPS_CLK 43 +#define GCC_BLSP1_QUP4_I2C_APPS_CLK 44 +#define GCC_BLSP1_QUP4_SPI_APPS_CLK 45 +#define GCC_BLSP1_QUP5_I2C_APPS_CLK 46 +#define GCC_BLSP1_QUP5_SPI_APPS_CLK 47 +#define GCC_BLSP1_QUP6_I2C_APPS_CLK 48 +#define GCC_BLSP1_QUP6_SPI_APPS_CLK 49 +#define GCC_BLSP1_SLEEP_CLK 50 +#define GCC_BLSP1_UART1_APPS_CLK 51 +#define GCC_BLSP1_UART2_APPS_CLK 52 +#define GCC_BLSP1_UART3_APPS_CLK 53 +#define GCC_BLSP2_AHB_CLK 54 +#define GCC_BLSP2_QUP1_I2C_APPS_CLK 55 +#define GCC_BLSP2_QUP1_SPI_APPS_CLK 56 +#define GCC_BLSP2_QUP2_I2C_APPS_CLK 57 +#define GCC_BLSP2_QUP2_SPI_APPS_CLK 58 +#define GCC_BLSP2_QUP3_I2C_APPS_CLK 59 +#define GCC_BLSP2_QUP3_SPI_APPS_CLK 60 +#define GCC_BLSP2_QUP4_I2C_APPS_CLK 61 +#define GCC_BLSP2_QUP4_SPI_APPS_CLK 62 +#define GCC_BLSP2_QUP5_I2C_APPS_CLK 63 +#define GCC_BLSP2_QUP5_SPI_APPS_CLK 64 +#define GCC_BLSP2_QUP6_I2C_APPS_CLK 65 +#define GCC_BLSP2_QUP6_SPI_APPS_CLK 66 +#define GCC_BLSP2_SLEEP_CLK 67 +#define GCC_BLSP2_UART1_APPS_CLK 68 +#define GCC_BLSP2_UART2_APPS_CLK 69 +#define GCC_BLSP2_UART3_APPS_CLK 70 +#define GCC_CFG_NOC_USB3_AXI_CLK 71 +#define GCC_GP1_CLK 72 +#define GCC_GP2_CLK 73 +#define GCC_GP3_CLK 74 +#define GCC_GPU_BIMC_GFX_CLK 75 +#define GCC_GPU_BIMC_GFX_SRC_CLK 76 +#define GCC_GPU_CFG_AHB_CLK 77 +#define GCC_GPU_SNOC_DVM_GFX_CLK 78 +#define GCC_HMSS_AHB_CLK 79 +#define GCC_HMSS_AT_CLK 80 +#define GCC_HMSS_DVM_BUS_CLK 81 +#define GCC_HMSS_RBCPR_CLK 82 +#define GCC_HMSS_TRIG_CLK 83 +#define GCC_LPASS_AT_CLK 84 +#define GCC_LPASS_TRIG_CLK 85 +#define GCC_MMSS_NOC_CFG_AHB_CLK 86 +#define GCC_MMSS_QM_AHB_CLK 87 +#define GCC_MMSS_QM_CORE_CLK 88 +#define GCC_MMSS_SYS_NOC_AXI_CLK 89 +#define GCC_MSS_AT_CLK 90 +#define GCC_PCIE_0_AUX_CLK 91 +#define GCC_PCIE_0_CFG_AHB_CLK 92 +#define GCC_PCIE_0_MSTR_AXI_CLK 
93 +#define GCC_PCIE_0_PIPE_CLK 94 +#define GCC_PCIE_0_SLV_AXI_CLK 95 +#define GCC_PCIE_PHY_AUX_CLK 96 +#define GCC_PDM2_CLK 97 +#define GCC_PDM_AHB_CLK 98 +#define GCC_PDM_XO4_CLK 99 +#define GCC_PRNG_AHB_CLK 100 +#define GCC_SDCC2_AHB_CLK 101 +#define GCC_SDCC2_APPS_CLK 102 +#define GCC_SDCC4_AHB_CLK 103 +#define GCC_SDCC4_APPS_CLK 104 +#define GCC_TSIF_AHB_CLK 105 +#define GCC_TSIF_INACTIVITY_TIMERS_CLK 106 +#define GCC_TSIF_REF_CLK 107 +#define GCC_UFS_AHB_CLK 108 +#define GCC_UFS_AXI_CLK 109 +#define GCC_UFS_ICE_CORE_CLK 110 +#define GCC_UFS_PHY_AUX_CLK 111 +#define GCC_UFS_RX_SYMBOL_0_CLK 112 +#define GCC_UFS_RX_SYMBOL_1_CLK 113 +#define GCC_UFS_TX_SYMBOL_0_CLK 114 +#define GCC_UFS_UNIPRO_CORE_CLK 115 +#define GCC_USB30_MASTER_CLK 116 +#define GCC_USB30_MOCK_UTMI_CLK 117 +#define GCC_USB30_SLEEP_CLK 118 +#define GCC_USB3_PHY_AUX_CLK 119 +#define GCC_USB3_PHY_PIPE_CLK 120 +#define GCC_USB_PHY_CFG_AHB2PHY_CLK 121 +#define GP1_CLK_SRC 122 +#define GP2_CLK_SRC 123 +#define GP3_CLK_SRC 124 +#define GPLL0 125 +#define GPLL0_OUT_EVEN 126 +#define GPLL0_OUT_MAIN 127 +#define GPLL0_OUT_ODD 128 +#define GPLL0_OUT_TEST 129 +#define GPLL1 130 +#define GPLL1_OUT_EVEN 131 +#define GPLL1_OUT_MAIN 132 +#define GPLL1_OUT_ODD 133 +#define GPLL1_OUT_TEST 134 +#define GPLL2 135 +#define GPLL2_OUT_EVEN 136 +#define GPLL2_OUT_MAIN 137 +#define GPLL2_OUT_ODD 138 +#define GPLL2_OUT_TEST 139 +#define GPLL3 140 +#define GPLL3_OUT_EVEN 141 +#define GPLL3_OUT_MAIN 142 +#define GPLL3_OUT_ODD 143 +#define GPLL3_OUT_TEST 144 +#define GPLL4 145 +#define GPLL4_OUT_EVEN 146 +#define GPLL4_OUT_MAIN 147 +#define GPLL4_OUT_ODD 148 +#define GPLL4_OUT_TEST 149 +#define GPLL6 150 +#define GPLL6_OUT_EVEN 151 +#define GPLL6_OUT_MAIN 152 +#define GPLL6_OUT_ODD 153 +#define GPLL6_OUT_TEST 154 +#define HMSS_AHB_CLK_SRC 155 +#define HMSS_RBCPR_CLK_SRC 156 +#define PCIE_AUX_CLK_SRC 157 +#define PDM2_CLK_SRC 158 +#define SDCC2_APPS_CLK_SRC 159 +#define SDCC4_APPS_CLK_SRC 160 +#define TSIF_REF_CLK_SRC 161 +#define UFS_AXI_CLK_SRC 162 +#define USB30_MASTER_CLK_SRC 163 +#define USB30_MOCK_UTMI_CLK_SRC 164 +#define USB3_PHY_AUX_CLK_SRC 165 + +#define PCIE_0_GDSC 0 +#define UFS_GDSC 1 +#define USB_30_GDSC 2 + +#define GCC_BLSP1_QUP1_BCR 0 +#define GCC_BLSP1_QUP2_BCR 1 +#define GCC_BLSP1_QUP3_BCR 2 +#define GCC_BLSP1_QUP4_BCR 3 +#define GCC_BLSP1_QUP5_BCR 4 +#define GCC_BLSP1_QUP6_BCR 5 +#define GCC_BLSP2_QUP1_BCR 6 +#define GCC_BLSP2_QUP2_BCR 7 +#define GCC_BLSP2_QUP3_BCR 8 +#define GCC_BLSP2_QUP4_BCR 9 +#define GCC_BLSP2_QUP5_BCR 10 +#define GCC_BLSP2_QUP6_BCR 11 +#define GCC_PCIE_0_BCR 12 +#define GCC_PDM_BCR 13 +#define GCC_SDCC2_BCR 14 +#define GCC_SDCC4_BCR 15 +#define GCC_TSIF_BCR 16 +#define GCC_UFS_BCR 17 +#define GCC_USB_30_BCR 18 + +#endif -- cgit v1.2.3 From 8b4aa4d8dc283a408eb14ca5e27e0a9a689c2e5a Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 5 Apr 2018 12:03:05 -0400 Subject: media: mmp-camera.h: add missing platform data Those definitions used to be part of the original patch: https://patchwork.kernel.org/patch/2815221/ But, somehow, nobody ever noticed until today. Years later, Arnd discovered that the mmp-camera driver doesn't build and made it depend on BROKEN. Add the missing bits here, in order to remove the BROKEN dependency.
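For orientation, a minimal sketch of how a board file might fill in this platform data once the fields added below exist; every value here (the GPIO numbers, clock settings and the V4L2_MBUS_PARALLEL choice) is a made-up placeholder, not taken from any real board:

/* Hypothetical board code, illustrating the fields added by this patch */
static struct mmp_camera_platform_data board_cam_pdata = {
	.sensor_power_gpio = 10,		/* placeholder GPIO */
	.sensor_reset_gpio = 11,		/* placeholder GPIO */
	.bus_type = V4L2_MBUS_PARALLEL,	/* DVP rather than MIPI */
	.mclk_min = 24000000,		/* placeholder minimal MCLK */
	.mclk_src = 3,			/* placeholder clock source */
	.mclk_div = 13,			/* placeholder divider */
	.lane = 0,			/* 0 means DVP mode, per the comment below */
};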
Signed-off-by: Mauro Carvalho Chehab --- include/linux/platform_data/media/mmp-camera.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include') diff --git a/include/linux/platform_data/media/mmp-camera.h b/include/linux/platform_data/media/mmp-camera.h index 83804028115c..d2d3a443eedf 100644 --- a/include/linux/platform_data/media/mmp-camera.h +++ b/include/linux/platform_data/media/mmp-camera.h @@ -3,8 +3,27 @@ * Information for the Marvell Armada MMP camera */ +#include <media/v4l2-mediabus.h> + +enum dphy3_algo { + DPHY3_ALGO_DEFAULT = 0, + DPHY3_ALGO_PXA910, + DPHY3_ALGO_PXA2128 +}; + struct mmp_camera_platform_data { struct platform_device *i2c_device; int sensor_power_gpio; int sensor_reset_gpio; + enum v4l2_mbus_type bus_type; + int mclk_min; /* The minimal value of MCLK */ + int mclk_src; /* which clock source the MCLK derives from */ + int mclk_div; /* Clock Divider Value for MCLK */ + /* + * MIPI support + */ + int dphy[3]; /* DPHY: CSI2_DPHY3, CSI2_DPHY5, CSI2_DPHY6 */ + enum dphy3_algo dphy3_algo; /* algorithm used to calculate CSI2_DPHY3 */ + int lane; /* ccic used lane number; 0 means DVP mode */ + int lane_clk; }; -- cgit v1.2.3 From 190d7f02ce8ef6774a69d3ec18c288c8a9601a4e Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Fri, 8 Dec 2017 15:28:18 +0100 Subject: HID: input: do not increment usages when a duplicate is found This is something that bothered us for a long time. When hid-input doesn't know how to map a usage, it uses *_MISC. But there is something else which increments the usage if the evdev code is already used. This leads to a few issues: - some devices may have their ABS_X mapped to ABS_Y if they export a bad set of usages (see the DragonRise joysticks IIRC -> fixed in a specific HID driver) - *_MISC + N might (will) conflict with other defined axes (my Logitech H800 exports some multitouch axes because of that) - this prevents us from freely adding new evdev usages, because "hey, my headset will now report ABS_COFFEE, and it's not coffee capable". So let's try to kill this nonsense, and hope we won't break too many devices. In my headset case, the ABS_MISC axes are created because of some proprietary usages, so we might not break that many devices. For backward compatibility, a quirk HID_QUIRK_INCREMENT_USAGE_ON_DUPLICATE is created and can be applied to any device that needs this behavior.
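To make that backward-compatibility path concrete, here is a minimal sketch of a device-specific driver opting back into the old incrementing behavior via the new quirk; the driver name and VID/PID are hypothetical placeholders, only HID_QUIRK_INCREMENT_USAGE_ON_DUPLICATE comes from this patch:

/* Hypothetical driver: restore *_MISC + N incrementing for one device */
#include <linux/hid.h>
#include <linux/module.h>

static int example_probe(struct hid_device *hdev, const struct hid_device_id *id)
{
	int ret;

	/* Opt in: duplicated usages get incremented instead of ignored */
	hdev->quirks |= HID_QUIRK_INCREMENT_USAGE_ON_DUPLICATE;

	ret = hid_parse(hdev);
	if (ret)
		return ret;

	return hid_hw_start(hdev, HID_CONNECT_DEFAULT);
}

static const struct hid_device_id example_devices[] = {
	{ HID_USB_DEVICE(0x1234, 0x5678) },	/* placeholder VID/PID */
	{ }
};
MODULE_DEVICE_TABLE(hid, example_devices);

static struct hid_driver example_driver = {
	.name = "example-increment-usage",
	.id_table = example_devices,
	.probe = example_probe,
};
module_hid_driver(example_driver);
MODULE_LICENSE("GPL");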
Signed-off-by: Benjamin Tissoires Acked-by: Peter Hutterer Acked-by: Dmitry Torokhov Signed-off-by: Jiri Kosina --- drivers/hid/hid-input.c | 33 +++++++++++++++++++++++++++++++-- include/linux/hid.h | 2 ++ 2 files changed, 33 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c index 6836a856c243..04056773102e 100644 --- a/drivers/hid/hid-input.c +++ b/drivers/hid/hid-input.c @@ -1100,8 +1100,31 @@ mapped: set_bit(usage->type, input->evbit); - while (usage->code <= max && test_and_set_bit(usage->code, bit)) - usage->code = find_next_zero_bit(bit, max + 1, usage->code); + /* + * This part is *really* controversial: + * - HID aims at being generic so we should do our best to export + * all incoming events + * - HID describes what events are, so there is no reason for ABS_X + * to be mapped to ABS_Y + * - HID is using *_MISC+N as a default value, but nothing prevents + * *_MISC+N from overwriting a legitimate event, which confuses userspace + * (for instance ABS_MISC + 7 is ABS_MT_SLOT, which has a different + * processing) + * + * If devices still want to use this (at their own risk), they will + * have to use the quirk HID_QUIRK_INCREMENT_USAGE_ON_DUPLICATE, but + * the default should be a reliable mapping. + */ + while (usage->code <= max && test_and_set_bit(usage->code, bit)) { + if (device->quirks & HID_QUIRK_INCREMENT_USAGE_ON_DUPLICATE) { + usage->code = find_next_zero_bit(bit, + max + 1, + usage->code); + } else { + device->status |= HID_STAT_DUP_DETECTED; + goto ignore; + } + } if (usage->code > max) goto ignore; @@ -1611,6 +1634,8 @@ int hidinput_connect(struct hid_device *hid, unsigned int force) INIT_LIST_HEAD(&hid->inputs); INIT_WORK(&hid->led_work, hidinput_led_worker); + hid->status &= ~HID_STAT_DUP_DETECTED; + if (!force) { for (i = 0; i < hid->maxcollection; i++) { struct hid_collection *col = &hid->collection[i]; @@ -1677,6 +1702,10 @@ int hidinput_connect(struct hid_device *hid, unsigned int force) goto out_unwind; } + if (hid->status & HID_STAT_DUP_DETECTED) + hid_dbg(hid, + "Some usages could not be mapped, please use HID_QUIRK_INCREMENT_USAGE_ON_DUPLICATE if this is legitimate.\n"); + return 0; out_unwind: diff --git a/include/linux/hid.h b/include/linux/hid.h index 8da3e1f48195..0267aa5c1ea3 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -345,6 +345,7 @@ struct hid_item { #define HID_QUIRK_SKIP_OUTPUT_REPORT_ID BIT(17) #define HID_QUIRK_NO_OUTPUT_REPORTS_ON_INTR_EP BIT(18) #define HID_QUIRK_HAVE_SPECIAL_DRIVER BIT(19) +#define HID_QUIRK_INCREMENT_USAGE_ON_DUPLICATE BIT(20) #define HID_QUIRK_FULLSPEED_INTERVAL BIT(28) #define HID_QUIRK_NO_INIT_REPORTS BIT(29) #define HID_QUIRK_NO_IGNORE BIT(30) @@ -502,6 +503,7 @@ struct hid_output_fifo { #define HID_STAT_ADDED BIT(0) #define HID_STAT_PARSED BIT(1) +#define HID_STAT_DUP_DETECTED BIT(2) struct hid_input { struct list_head list; -- cgit v1.2.3 From 5ab073ffd326480a6185d096e9703f62ef92b86c Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:45:26 +0200 Subject: xdp: introduce xdp_return_frame API and use in cpumap Introduce an xdp_return_frame API, and convert over cpumap as the first user, given it has a queued XDP frame structure to leverage. V3: Cleanup and remove C99 style comments, pointed out by Alex Duyck. V6: Remove comment that id will be added later (Req by Alex Duyck) V8: Rename enum mem_type to xdp_mem_type (found by kbuild test robot) Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S.
Miller --- include/net/xdp.h | 27 ++++++++++++++++++++++++ kernel/bpf/cpumap.c | 60 ++++++++++++++++++++++++++++++++--------------------- net/core/xdp.c | 18 ++++++++++++++++ 3 files changed, 81 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index b2362ddfa694..e4207699c410 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -33,16 +33,43 @@ * also mandatory during RX-ring setup. */ +enum xdp_mem_type { + MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */ + MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */ + MEM_TYPE_MAX, +}; + +struct xdp_mem_info { + u32 type; /* enum xdp_mem_type, but known size type */ +}; + struct xdp_rxq_info { struct net_device *dev; u32 queue_index; u32 reg_state; + struct xdp_mem_info mem; } ____cacheline_aligned; /* perf critical, avoid false-sharing */ + +static inline +void xdp_return_frame(void *data, struct xdp_mem_info *mem) +{ + if (mem->type == MEM_TYPE_PAGE_SHARED) + page_frag_free(data); + + if (mem->type == MEM_TYPE_PAGE_ORDER0) { + struct page *page = virt_to_page(data); /* Assumes order0 page*/ + + put_page(page); + } +} + int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index); void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq); void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq); bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq); +int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, + enum xdp_mem_type type, void *allocator); #endif /* __LINUX_NET_XDP_H__ */ diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index a4bb0b34375a..3e4bbcbe3e86 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -19,6 +19,7 @@ #include #include #include +#include <net/xdp.h> #include #include @@ -137,27 +138,6 @@ free_cmap: return ERR_PTR(err); } -static void __cpu_map_queue_destructor(void *ptr) -{ - /* The tear-down procedure should have made sure that queue is - * empty. See __cpu_map_entry_replace() and work-queue - * invoked cpu_map_kthread_stop(). Catch any broken behaviour - * gracefully and warn once. - */ - if (WARN_ON_ONCE(ptr)) - page_frag_free(ptr); -} - -static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) -{ - if (atomic_dec_and_test(&rcpu->refcnt)) { - /* The queue should be empty at this point */ - ptr_ring_cleanup(rcpu->queue, __cpu_map_queue_destructor); - kfree(rcpu->queue); - kfree(rcpu); - } -} - static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) { atomic_inc(&rcpu->refcnt); @@ -188,6 +168,10 @@ struct xdp_pkt { u16 len; u16 headroom; u16 metasize; + /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time, + * while mem info is valid on remote CPU. + */ + struct xdp_mem_info mem; struct net_device *dev_rx; }; @@ -213,6 +197,9 @@ static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp) xdp_pkt->headroom = headroom - sizeof(*xdp_pkt); xdp_pkt->metasize = metasize; + /* rxq only valid until napi_schedule ends, convert to xdp_mem_info */ + xdp_pkt->mem = xdp->rxq->mem; + return xdp_pkt; } @@ -265,6 +252,31 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, return skb; } +static void __cpu_map_ring_cleanup(struct ptr_ring *ring) +{ + /* The tear-down procedure should have made sure that queue is + * empty. See __cpu_map_entry_replace() and work-queue + * invoked cpu_map_kthread_stop(). Catch any broken behaviour + * gracefully and warn once.
+ */ + struct xdp_pkt *xdp_pkt; + + while ((xdp_pkt = ptr_ring_consume(ring))) + if (WARN_ON_ONCE(xdp_pkt)) + xdp_return_frame(xdp_pkt, &xdp_pkt->mem); +} + +static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) +{ + if (atomic_dec_and_test(&rcpu->refcnt)) { + /* The queue should be empty at this point */ + __cpu_map_ring_cleanup(rcpu->queue); + ptr_ring_cleanup(rcpu->queue, NULL); + kfree(rcpu->queue); + kfree(rcpu); + } +} + static int cpu_map_kthread_run(void *data) { struct bpf_cpu_map_entry *rcpu = data; @@ -307,7 +319,7 @@ static int cpu_map_kthread_run(void *data) skb = cpu_map_build_skb(rcpu, xdp_pkt); if (!skb) { - page_frag_free(xdp_pkt); + xdp_return_frame(xdp_pkt, &xdp_pkt->mem); continue; } @@ -604,13 +616,13 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, spin_lock(&q->producer_lock); for (i = 0; i < bq->count; i++) { - void *xdp_pkt = bq->q[i]; + struct xdp_pkt *xdp_pkt = bq->q[i]; int err; err = __ptr_ring_produce(q, xdp_pkt); if (err) { drops++; - page_frag_free(xdp_pkt); /* Free xdp_pkt */ + xdp_return_frame(xdp_pkt->data, &xdp_pkt->mem); } processed++; } diff --git a/net/core/xdp.c b/net/core/xdp.c index 097a0f74e004..7e6b3545277d 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -71,3 +71,21 @@ bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq) return (xdp_rxq->reg_state == REG_STATE_REGISTERED); } EXPORT_SYMBOL_GPL(xdp_rxq_info_is_reg); + +int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, + enum xdp_mem_type type, void *allocator) +{ + if (type >= MEM_TYPE_MAX) + return -EINVAL; + + xdp_rxq->mem.type = type; + + if (allocator) + return -EOPNOTSUPP; + + /* TODO: Allocate an ID that maps to allocator pointer + * See: https://www.kernel.org/doc/html/latest/core-api/idr.html + */ + return 0; +} +EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); -- cgit v1.2.3 From 106ca27f2922e8de820d1bd3d79b1cbdf2d78eea Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:45:37 +0200 Subject: xdp: move struct xdp_buff from filter.h to xdp.h This is done to prepare for the next patch, and it is also nice to move this XDP related struct out of filter.h. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/linux/filter.h | 24 +----------------------- include/net/xdp.h | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index fc4e8f91b03d..4da8b2308174 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -30,6 +30,7 @@ struct sock; struct seccomp_data; struct bpf_prog_aux; struct xdp_rxq_info; +struct xdp_buff; /* ArgX, context and stack frame pointer register positions. Note, * Arg1, Arg2, Arg3, etc are used as argument mappings of function @@ -500,14 +501,6 @@ struct bpf_skb_data_end { void *data_end; }; -struct xdp_buff { - void *data; - void *data_end; - void *data_meta; - void *data_hard_start; - struct xdp_rxq_info *rxq; -}; - struct sk_msg_buff { void *data; void *data_end; @@ -772,21 +765,6 @@ int xdp_do_redirect(struct net_device *dev, struct bpf_prog *prog); void xdp_do_flush_map(void); -/* Drivers not supporting XDP metadata can use this helper, which - * rejects any room expansion for metadata as a result. 
- */ -static __always_inline void -xdp_set_data_meta_invalid(struct xdp_buff *xdp) -{ - xdp->data_meta = xdp->data + 1; -} - -static __always_inline bool -xdp_data_meta_unsupported(const struct xdp_buff *xdp) -{ - return unlikely(xdp->data_meta > xdp->data); -} - void bpf_warn_invalid_xdp_action(u32 act); struct sock *do_sk_redirect_map(struct sk_buff *skb); diff --git a/include/net/xdp.h b/include/net/xdp.h index e4207699c410..15f8ade008b5 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -50,6 +50,13 @@ struct xdp_rxq_info { struct xdp_mem_info mem; } ____cacheline_aligned; /* perf critical, avoid false-sharing */ +struct xdp_buff { + void *data; + void *data_end; + void *data_meta; + void *data_hard_start; + struct xdp_rxq_info *rxq; +}; static inline void xdp_return_frame(void *data, struct xdp_mem_info *mem) @@ -72,4 +79,19 @@ bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq); int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, enum xdp_mem_type type, void *allocator); +/* Drivers not supporting XDP metadata can use this helper, which + * rejects any room expansion for metadata as a result. + */ +static __always_inline void +xdp_set_data_meta_invalid(struct xdp_buff *xdp) +{ + xdp->data_meta = xdp->data + 1; +} + +static __always_inline bool +xdp_data_meta_unsupported(const struct xdp_buff *xdp) +{ + return unlikely(xdp->data_meta > xdp->data); +} + #endif /* __LINUX_NET_XDP_H__ */ -- cgit v1.2.3 From c0048cff8abb69c956ce1277d17a3f7a14e41522 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:45:42 +0200 Subject: xdp: introduce a new xdp_frame type This is needed to convert drivers tuntap and virtio_net. This is a generalization of what is done inside cpumap, which will be converted later. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/net/xdp.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index 15f8ade008b5..756c42811e78 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -58,6 +58,46 @@ struct xdp_buff { struct xdp_rxq_info *rxq; }; +struct xdp_frame { + void *data; + u16 len; + u16 headroom; + u16 metasize; + /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time, + * while mem info is valid on remote CPU. + */ + struct xdp_mem_info mem; +}; + +/* Convert xdp_buff to xdp_frame */ +static inline +struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) +{ + struct xdp_frame *xdp_frame; + int metasize; + int headroom; + + /* Assure headroom is available for storing info */ + headroom = xdp->data - xdp->data_hard_start; + metasize = xdp->data - xdp->data_meta; + metasize = metasize > 0 ? 
metasize : 0; + if (unlikely((headroom - metasize) < sizeof(*xdp_frame))) + return NULL; + + /* Store info in top of packet */ + xdp_frame = xdp->data_hard_start; + + xdp_frame->data = xdp->data; + xdp_frame->len = xdp->data_end - xdp->data; + xdp_frame->headroom = headroom - sizeof(*xdp_frame); + xdp_frame->metasize = metasize; + + /* rxq only valid until napi_schedule ends, convert to xdp_mem_info */ + xdp_frame->mem = xdp->rxq->mem; + + return xdp_frame; +} + static inline void xdp_return_frame(void *data, struct xdp_mem_info *mem) { -- cgit v1.2.3 From 1ffcbc8537d0bc32aaca7000cb9c904ec4b6300f Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:45:47 +0200 Subject: tun: convert to use generic xdp_frame and xdp_return_frame API The tuntap driver invented its own driver-specific way of queuing XDP packets, by storing the xdp_buff information in the top of the XDP frame data. Convert it over to use the more generic xdp_frame structure. The main problem with the in-driver method is that the xdp_rxq_info pointer cannot be trusted/used when dequeueing the frame. V3: Remove check based on feedback from Jason Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- drivers/net/tun.c | 43 ++++++++++++++++++++----------------------- drivers/vhost/net.c | 7 ++++--- include/linux/if_tun.h | 4 ++-- 3 files changed, 26 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 28583aa0c17d..2c85e5cac2a9 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -248,11 +248,11 @@ struct veth { __be16 h_vlan_TCI; }; -bool tun_is_xdp_buff(void *ptr) +bool tun_is_xdp_frame(void *ptr) { return (unsigned long)ptr & TUN_XDP_FLAG; } -EXPORT_SYMBOL(tun_is_xdp_buff); +EXPORT_SYMBOL(tun_is_xdp_frame); void *tun_xdp_to_ptr(void *ptr) { @@ -660,10 +660,10 @@ void tun_ptr_free(void *ptr) { if (!ptr) return; - if (tun_is_xdp_buff(ptr)) { - struct xdp_buff *xdp = tun_ptr_to_xdp(ptr); - - put_page(virt_to_head_page(xdp->data)); + if (tun_is_xdp_frame(ptr)) { + struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); + + xdp_return_frame(xdpf->data, &xdpf->mem); } else { __skb_array_destroy_skb(ptr); } @@ -1298,17 +1298,14 @@ static const struct net_device_ops tun_netdev_ops = { static int tun_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) { struct tun_struct *tun = netdev_priv(dev); - struct xdp_buff *buff = xdp->data_hard_start; - int headroom = xdp->data - xdp->data_hard_start; + struct xdp_frame *frame; struct tun_file *tfile; u32 numqueues; int ret = 0; - /* Assure headroom is available and buff is properly aligned */ - if (unlikely(headroom < sizeof(*xdp) || tun_is_xdp_buff(xdp))) - return -ENOSPC; - - *buff = *xdp; + frame = convert_to_xdp_frame(xdp); + if (unlikely(!frame)) + return -EOVERFLOW; rcu_read_lock(); @@ -1323,7 +1320,7 @@ static int tun_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) /* Encode the XDP flag into lowest bit for consumer to differ * XDP buffer from sk_buff.
*/ - if (ptr_ring_produce(&tfile->tx_ring, tun_xdp_to_ptr(buff))) { + if (ptr_ring_produce(&tfile->tx_ring, tun_xdp_to_ptr(frame))) { this_cpu_inc(tun->pcpu_stats->tx_dropped); ret = -ENOSPC; } @@ -2001,11 +1998,11 @@ static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from) static ssize_t tun_put_user_xdp(struct tun_struct *tun, struct tun_file *tfile, - struct xdp_buff *xdp, + struct xdp_frame *xdp_frame, struct iov_iter *iter) { int vnet_hdr_sz = 0; - size_t size = xdp->data_end - xdp->data; + size_t size = xdp_frame->len; struct tun_pcpu_stats *stats; size_t ret; @@ -2021,7 +2018,7 @@ static ssize_t tun_put_user_xdp(struct tun_struct *tun, iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso)); } - ret = copy_to_iter(xdp->data, size, iter) + vnet_hdr_sz; + ret = copy_to_iter(xdp_frame->data, size, iter) + vnet_hdr_sz; stats = get_cpu_ptr(tun->pcpu_stats); u64_stats_update_begin(&stats->syncp); @@ -2189,11 +2186,11 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile, return err; } - if (tun_is_xdp_buff(ptr)) { - struct xdp_buff *xdp = tun_ptr_to_xdp(ptr); + if (tun_is_xdp_frame(ptr)) { + struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); - ret = tun_put_user_xdp(tun, tfile, xdp, to); - put_page(virt_to_head_page(xdp->data)); + ret = tun_put_user_xdp(tun, tfile, xdpf, to); + xdp_return_frame(xdpf->data, &xdpf->mem); } else { struct sk_buff *skb = ptr; @@ -2432,10 +2429,10 @@ out_free: static int tun_ptr_peek_len(void *ptr) { if (likely(ptr)) { - if (tun_is_xdp_buff(ptr)) { - struct xdp_buff *xdp = tun_ptr_to_xdp(ptr); + if (tun_is_xdp_frame(ptr)) { + struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); - return xdp->data_end - xdp->data; + return xdpf->len; } return __skb_array_len_with_tag(ptr); } else { diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 986058a57917..bbf38befefb2 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -32,6 +32,7 @@ #include #include +#include <net/xdp.h> #include "vhost.h" @@ -181,10 +182,10 @@ static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq) static int vhost_net_buf_peek_len(void *ptr) { - if (tun_is_xdp_buff(ptr)) { - struct xdp_buff *xdp = tun_ptr_to_xdp(ptr); + if (tun_is_xdp_frame(ptr)) { + struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); - return xdp->data_end - xdp->data; + return xdpf->len; } return __skb_array_len_with_tag(ptr); diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h index fd00170b494f..3d2996dc7d85 100644 --- a/include/linux/if_tun.h +++ b/include/linux/if_tun.h @@ -22,7 +22,7 @@ #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) struct socket *tun_get_socket(struct file *); struct ptr_ring *tun_get_tx_ring(struct file *file); -bool tun_is_xdp_buff(void *ptr); +bool tun_is_xdp_frame(void *ptr); void *tun_xdp_to_ptr(void *ptr); void *tun_ptr_to_xdp(void *ptr); void tun_ptr_free(void *ptr); @@ -39,7 +39,7 @@ static inline struct ptr_ring *tun_get_tx_ring(struct file *f) { return ERR_PTR(-EINVAL); } -static inline bool tun_is_xdp_buff(void *ptr) +static inline bool tun_is_xdp_frame(void *ptr) { return false; } -- cgit v1.2.3 From 70280ed91cb8acb43e8fd7a8094840846c172ac5 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:45:57 +0200 Subject: bpf: cpumap convert to use generic xdp_frame The generic xdp_frame format was inspired by cpumap's own internal xdp_pkt format. It is now time to convert it over to the generic xdp_frame format. The cpumap needs one extra field, dev_rx. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S.
Miller --- include/net/xdp.h | 1 + kernel/bpf/cpumap.c | 100 +++++++++++++++------------------------------------- 2 files changed, 29 insertions(+), 72 deletions(-) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index 756c42811e78..ea3773f94f65 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -67,6 +67,7 @@ struct xdp_frame { * while mem info is valid on remote CPU. */ struct xdp_mem_info mem; + struct net_device *dev_rx; /* used by cpumap */ }; /* Convert xdp_buff to xdp_frame */ diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 3e4bbcbe3e86..bcdc4dea5ce7 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -159,52 +159,8 @@ static void cpu_map_kthread_stop(struct work_struct *work) kthread_stop(rcpu->kthread); } -/* For now, xdp_pkt is a cpumap internal data structure, with info - * carried between enqueue to dequeue. It is mapped into the top - * headroom of the packet, to avoid allocating separate mem. - */ -struct xdp_pkt { - void *data; - u16 len; - u16 headroom; - u16 metasize; - /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time, - * while mem info is valid on remote CPU. - */ - struct xdp_mem_info mem; - struct net_device *dev_rx; -}; - -/* Convert xdp_buff to xdp_pkt */ -static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp) -{ - struct xdp_pkt *xdp_pkt; - int metasize; - int headroom; - - /* Assure headroom is available for storing info */ - headroom = xdp->data - xdp->data_hard_start; - metasize = xdp->data - xdp->data_meta; - metasize = metasize > 0 ? metasize : 0; - if (unlikely((headroom - metasize) < sizeof(*xdp_pkt))) - return NULL; - - /* Store info in top of packet */ - xdp_pkt = xdp->data_hard_start; - - xdp_pkt->data = xdp->data; - xdp_pkt->len = xdp->data_end - xdp->data; - xdp_pkt->headroom = headroom - sizeof(*xdp_pkt); - xdp_pkt->metasize = metasize; - - /* rxq only valid until napi_schedule ends, convert to xdp_mem_info */ - xdp_pkt->mem = xdp->rxq->mem; - - return xdp_pkt; -} - static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, - struct xdp_pkt *xdp_pkt) + struct xdp_frame *xdpf) { unsigned int frame_size; void *pkt_data_start; @@ -219,7 +175,7 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, * would be preferred to set frame_size to 2048 or 4096 * depending on the driver. * frame_size = 2048; - * frame_len = frame_size - sizeof(*xdp_pkt); + * frame_len = frame_size - sizeof(*xdp_frame); * * Instead, with info avail, skb_shared_info in placed after * packet len. This, unfortunately fakes the truesize. @@ -227,21 +183,21 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, * is not at a fixed memory location, with mixed length * packets, which is bad for cache-line hotness. 
*/ - frame_size = SKB_DATA_ALIGN(xdp_pkt->len) + xdp_pkt->headroom + + frame_size = SKB_DATA_ALIGN(xdpf->len) + xdpf->headroom + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - pkt_data_start = xdp_pkt->data - xdp_pkt->headroom; + pkt_data_start = xdpf->data - xdpf->headroom; skb = build_skb(pkt_data_start, frame_size); if (!skb) return NULL; - skb_reserve(skb, xdp_pkt->headroom); - __skb_put(skb, xdp_pkt->len); - if (xdp_pkt->metasize) - skb_metadata_set(skb, xdp_pkt->metasize); + skb_reserve(skb, xdpf->headroom); + __skb_put(skb, xdpf->len); + if (xdpf->metasize) + skb_metadata_set(skb, xdpf->metasize); /* Essential SKB info: protocol and skb->dev */ - skb->protocol = eth_type_trans(skb, xdp_pkt->dev_rx); + skb->protocol = eth_type_trans(skb, xdpf->dev_rx); /* Optional SKB info, currently missing: * - HW checksum info (skb->ip_summed) @@ -259,11 +215,11 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring) * invoked cpu_map_kthread_stop(). Catch any broken behaviour * gracefully and warn once. */ - struct xdp_pkt *xdp_pkt; + struct xdp_frame *xdpf; - while ((xdp_pkt = ptr_ring_consume(ring))) - if (WARN_ON_ONCE(xdp_pkt)) - xdp_return_frame(xdp_pkt, &xdp_pkt->mem); + while ((xdpf = ptr_ring_consume(ring))) + if (WARN_ON_ONCE(xdpf)) + xdp_return_frame(xdpf->data, &xdpf->mem); } static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) @@ -290,7 +246,7 @@ static int cpu_map_kthread_run(void *data) */ while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { unsigned int processed = 0, drops = 0, sched = 0; - struct xdp_pkt *xdp_pkt; + struct xdp_frame *xdpf; /* Release CPU reschedule checks */ if (__ptr_ring_empty(rcpu->queue)) { @@ -313,13 +269,13 @@ static int cpu_map_kthread_run(void *data) * kthread CPU pinned. Lockless access to ptr_ring * consume side valid as no-resize allowed of queue. */ - while ((xdp_pkt = __ptr_ring_consume(rcpu->queue))) { + while ((xdpf = __ptr_ring_consume(rcpu->queue))) { struct sk_buff *skb; int ret; - skb = cpu_map_build_skb(rcpu, xdp_pkt); + skb = cpu_map_build_skb(rcpu, xdpf); if (!skb) { - xdp_return_frame(xdp_pkt, &xdp_pkt->mem); + xdp_return_frame(xdpf->data, &xdpf->mem); continue; } @@ -616,13 +572,13 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, spin_lock(&q->producer_lock); for (i = 0; i < bq->count; i++) { - struct xdp_pkt *xdp_pkt = bq->q[i]; + struct xdp_frame *xdpf = bq->q[i]; int err; - err = __ptr_ring_produce(q, xdp_pkt); + err = __ptr_ring_produce(q, xdpf); if (err) { drops++; - xdp_return_frame(xdp_pkt->data, &xdp_pkt->mem); + xdp_return_frame(xdpf->data, &xdpf->mem); } processed++; } @@ -637,7 +593,7 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, /* Runs under RCU-read-side, plus in softirq under NAPI protection. * Thus, safe percpu variable access. */ -static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt) +static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) { struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); @@ -648,28 +604,28 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt) * driver to code invoking us to finished, due to driver * (e.g. ixgbe) recycle tricks based on page-refcnt. * - * Thus, incoming xdp_pkt is always queued here (else we race + * Thus, incoming xdp_frame is always queued here (else we race * with another CPU on page-refcnt and remaining driver code). * Queue time is very short, as driver will invoke flush * operation, when completing napi->poll call. 
- bq->q[bq->count++] = xdp_pkt; + bq->q[bq->count++] = xdpf; return 0; } int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct net_device *dev_rx) { - struct xdp_pkt *xdp_pkt; + struct xdp_frame *xdpf; - xdp_pkt = convert_to_xdp_pkt(xdp); - if (unlikely(!xdp_pkt)) + xdpf = convert_to_xdp_frame(xdp); + if (unlikely(!xdpf)) return -EOVERFLOW; /* Info needed when constructing SKB on remote CPU */ - xdp_pkt->dev_rx = dev_rx; + xdpf->dev_rx = dev_rx; - bq_enqueue(rcpu, xdp_pkt); + bq_enqueue(rcpu, xdpf); return 0; } -- cgit v1.2.3 From 8d5d88527587516bd58ff0f3810f07c38e65e2be Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:46:12 +0200 Subject: xdp: rhashtable with allocator ID to pointer mapping Use the IDA infrastructure for getting a cyclic increasing ID number, that is used for keeping track of each registered allocator per RX-queue xdp_rxq_info. Instead of using the IDR infrastructure, which uses a radix tree, use a dynamic rhashtable, for creating ID to pointer lookup table, because this is faster. The problem that is being solved here is that the xdp_rxq_info pointer (stored in xdp_buff) cannot be used directly, as the guaranteed lifetime is too short. The info is needed on a (potentially) remote CPU during DMA-TX completion time. In an xdp_frame the xdp_mem_info is stored, when it got converted from an xdp_buff, which is sufficient for the simple page refcnt based recycle schemes. For more advanced allocators there is a need to store a pointer to the registered allocator. Thus, there is a need to guard the lifetime or validity of the allocator pointer, which is done through this rhashtable ID map to pointer. The removal and validity of the allocator and helper struct xdp_mem_allocator is guarded by RCU. The allocator will be created by the driver, and registered with xdp_rxq_info_reg_mem_model(). It is up for debate who is responsible for freeing the allocator pointer or invoking the allocator destructor function. In any case, this must happen via RCU freeing. V4: Per req of Jason Wang - Use xdp_rxq_info_reg_mem_model() in all drivers implementing XDP_REDIRECT, even though it's not strictly necessary when allocator==NULL for type MEM_TYPE_PAGE_SHARED (given it's zero). V6: Per req of Alex Duyck - Introduce rhashtable_lookup() call in later patch V8: Address sparse "should be static" warnings (from kbuild test robot) Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S.
Miller --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 9 +- drivers/net/tun.c | 6 + drivers/net/virtio_net.c | 7 + include/net/xdp.h | 14 +- net/core/xdp.c | 223 +++++++++++++++++++++++++- 5 files changed, 241 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 0bfe6cf2bf8b..f10904ec2172 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -6370,7 +6370,7 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter, struct device *dev = rx_ring->dev; int orig_node = dev_to_node(dev); int ring_node = -1; - int size; + int size, err; size = sizeof(struct ixgbe_rx_buffer) * rx_ring->count; @@ -6407,6 +6407,13 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter, rx_ring->queue_index) < 0) goto err; + err = xdp_rxq_info_reg_mem_model(&rx_ring->xdp_rxq, + MEM_TYPE_PAGE_SHARED, NULL); + if (err) { + xdp_rxq_info_unreg(&rx_ring->xdp_rxq); + goto err; + } + rx_ring->xdp_prog = adapter->xdp_prog; return 0; diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 2c85e5cac2a9..283bde85c455 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -854,6 +854,12 @@ static int tun_attach(struct tun_struct *tun, struct file *file, tun->dev, tfile->queue_index); if (err < 0) goto out; + err = xdp_rxq_info_reg_mem_model(&tfile->xdp_rxq, + MEM_TYPE_PAGE_SHARED, NULL); + if (err < 0) { + xdp_rxq_info_unreg(&tfile->xdp_rxq); + goto out; + } err = 0; } diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index f50e1ad81ad4..42d338fe9a8d 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1305,6 +1305,13 @@ static int virtnet_open(struct net_device *dev) if (err < 0) return err; + err = xdp_rxq_info_reg_mem_model(&vi->rq[i].xdp_rxq, + MEM_TYPE_PAGE_SHARED, NULL); + if (err < 0) { + xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq); + return err; + } + virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi); virtnet_napi_tx_enable(vi, vi->sq[i].vq, &vi->sq[i].napi); } diff --git a/include/net/xdp.h b/include/net/xdp.h index ea3773f94f65..5f67c62540aa 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -41,6 +41,7 @@ enum xdp_mem_type { struct xdp_mem_info { u32 type; /* enum xdp_mem_type, but known size type */ + u32 id; }; struct xdp_rxq_info { @@ -99,18 +100,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) return xdp_frame; } -static inline -void xdp_return_frame(void *data, struct xdp_mem_info *mem) -{ - if (mem->type == MEM_TYPE_PAGE_SHARED) - page_frag_free(data); - - if (mem->type == MEM_TYPE_PAGE_ORDER0) { - struct page *page = virt_to_page(data); /* Assumes order0 page*/ - - put_page(page); - } -} +void xdp_return_frame(void *data, struct xdp_mem_info *mem); int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index); diff --git a/net/core/xdp.c b/net/core/xdp.c index 7e6b3545277d..8b2cb79b5de0 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -5,6 +5,9 @@ */ #include #include +#include <linux/slab.h> +#include <linux/idr.h> +#include <linux/rhashtable.h> #include @@ -13,6 +16,99 @@ #define REG_STATE_UNREGISTERED 0x2 #define REG_STATE_UNUSED 0x3 +static DEFINE_IDA(mem_id_pool); +static DEFINE_MUTEX(mem_id_lock); +#define MEM_ID_MAX 0xFFFE +#define MEM_ID_MIN 1 +static int mem_id_next = MEM_ID_MIN; + +static bool mem_id_init; /* false */ +static struct rhashtable *mem_id_ht; + +struct xdp_mem_allocator { + struct xdp_mem_info mem; + void *allocator; + struct rhash_head node; + struct rcu_head
rcu; +}; + +static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed) +{ + const u32 *k = data; + const u32 key = *k; + + BUILD_BUG_ON(FIELD_SIZEOF(struct xdp_mem_allocator, mem.id) + != sizeof(u32)); + + /* Use cyclic increasing ID as direct hash key, see rht_bucket_index */ + return key << RHT_HASH_RESERVED_SPACE; +} + +static int xdp_mem_id_cmp(struct rhashtable_compare_arg *arg, + const void *ptr) +{ + const struct xdp_mem_allocator *xa = ptr; + u32 mem_id = *(u32 *)arg->key; + + return xa->mem.id != mem_id; +} + +static const struct rhashtable_params mem_id_rht_params = { + .nelem_hint = 64, + .head_offset = offsetof(struct xdp_mem_allocator, node), + .key_offset = offsetof(struct xdp_mem_allocator, mem.id), + .key_len = FIELD_SIZEOF(struct xdp_mem_allocator, mem.id), + .max_size = MEM_ID_MAX, + .min_size = 8, + .automatic_shrinking = true, + .hashfn = xdp_mem_id_hashfn, + .obj_cmpfn = xdp_mem_id_cmp, +}; + +static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu) +{ + struct xdp_mem_allocator *xa; + + xa = container_of(rcu, struct xdp_mem_allocator, rcu); + + /* Allow this ID to be reused */ + ida_simple_remove(&mem_id_pool, xa->mem.id); + + /* TODO: Depending on allocator type/pointer free resources */ + + /* Poison memory */ + xa->mem.id = 0xFFFF; + xa->mem.type = 0xF0F0; + xa->allocator = (void *)0xDEAD9001; + + kfree(xa); +} + +static void __xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) +{ + struct xdp_mem_allocator *xa; + int id = xdp_rxq->mem.id; + int err; + + if (id == 0) + return; + + mutex_lock(&mem_id_lock); + + xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params); + if (!xa) { + mutex_unlock(&mem_id_lock); + return; + } + + err = rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params); + WARN_ON(err); + + call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); + + mutex_unlock(&mem_id_lock); +} + void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) { /* Simplify driver cleanup code paths, allow unreg "unused" */ @@ -21,8 +117,14 @@ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) WARN(!(xdp_rxq->reg_state == REG_STATE_REGISTERED), "Driver BUG"); + __xdp_rxq_info_unreg_mem_model(xdp_rxq); + xdp_rxq->reg_state = REG_STATE_UNREGISTERED; xdp_rxq->dev = NULL; + + /* Reset mem info to defaults */ + xdp_rxq->mem.id = 0; + xdp_rxq->mem.type = 0; } EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg); @@ -72,20 +174,131 @@ bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq) } EXPORT_SYMBOL_GPL(xdp_rxq_info_is_reg); +static int __mem_id_init_hash_table(void) +{ + struct rhashtable *rht; + int ret; + + if (unlikely(mem_id_init)) + return 0; + + rht = kzalloc(sizeof(*rht), GFP_KERNEL); + if (!rht) + return -ENOMEM; + + ret = rhashtable_init(rht, &mem_id_rht_params); + if (ret < 0) { + kfree(rht); + return ret; + } + mem_id_ht = rht; + smp_mb(); /* mutex lock should provide enough pairing */ + mem_id_init = true; + + return 0; +} + +/* Allocate a cyclic ID that maps to allocator pointer. + * See: https://www.kernel.org/doc/html/latest/core-api/idr.html + * + * Caller must lock mem_id_lock. 
+ */ +static int __mem_id_cyclic_get(gfp_t gfp) +{ + int retries = 1; + int id; + +again: + id = ida_simple_get(&mem_id_pool, mem_id_next, MEM_ID_MAX, gfp); + if (id < 0) { + if (id == -ENOSPC) { + /* Cyclic allocator, reset next id */ + if (retries--) { + mem_id_next = MEM_ID_MIN; + goto again; + } + } + return id; /* errno */ + } + mem_id_next = id + 1; + + return id; +} + int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, enum xdp_mem_type type, void *allocator) { + struct xdp_mem_allocator *xdp_alloc; + gfp_t gfp = GFP_KERNEL; + int id, errno, ret; + void *ptr; + + if (xdp_rxq->reg_state != REG_STATE_REGISTERED) { + WARN(1, "Missing register, driver bug"); + return -EFAULT; + } + if (type >= MEM_TYPE_MAX) return -EINVAL; xdp_rxq->mem.type = type; - if (allocator) - return -EOPNOTSUPP; + if (!allocator) + return 0; + + /* Delay init of rhashtable to save memory if feature isn't used */ + if (!mem_id_init) { + mutex_lock(&mem_id_lock); + ret = __mem_id_init_hash_table(); + mutex_unlock(&mem_id_lock); + if (ret < 0) { + WARN_ON(1); + return ret; + } + } + + xdp_alloc = kzalloc(sizeof(*xdp_alloc), gfp); + if (!xdp_alloc) + return -ENOMEM; + + mutex_lock(&mem_id_lock); + id = __mem_id_cyclic_get(gfp); + if (id < 0) { + errno = id; + goto err; + } + xdp_rxq->mem.id = id; + xdp_alloc->mem = xdp_rxq->mem; + xdp_alloc->allocator = allocator; + + /* Insert allocator into ID lookup table */ + ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node); + if (IS_ERR(ptr)) { + errno = PTR_ERR(ptr); + goto err; + } + + mutex_unlock(&mem_id_lock); - /* TODO: Allocate an ID that maps to allocator pointer - * See: https://www.kernel.org/doc/html/latest/core-api/idr.html - */ return 0; +err: + mutex_unlock(&mem_id_lock); + kfree(xdp_alloc); + return errno; } EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); + +void xdp_return_frame(void *data, struct xdp_mem_info *mem) +{ + if (mem->type == MEM_TYPE_PAGE_SHARED) { + page_frag_free(data); + return; + } + + if (mem->type == MEM_TYPE_PAGE_ORDER0) { + struct page *page = virt_to_page(data); /* Assumes order0 page*/ + + put_page(page); + } +} +EXPORT_SYMBOL_GPL(xdp_return_frame); -- cgit v1.2.3 From ff7d6b27f894f1469dc51ccb828b7363ccd9799f Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:46:17 +0200 Subject: page_pool: refurbish version of page_pool code Need a fast page recycle mechanism for ndo_xdp_xmit API for returning pages on DMA-TX completion time, which have good cross CPU performance, given DMA-TX completion time can happen on a remote CPU. Refurbish my page_pool code, that was presented[1] at MM-summit 2016. Adapted page_pool code to not depend the page allocator and integration into struct page. The DMA mapping feature is kept, even-though it will not be activated/used in this patchset. [1] http://people.netfilter.org/hawk/presentations/MM-summit2016/generic_page_pool_mm_summit2016.pdf V2: Adjustments requested by Tariq - Changed page_pool_create return codes, don't return NULL, only ERR_PTR, as this simplifies err handling in drivers. V4: many small improvements and cleanups - Add DOC comment section, that can be used by kernel-doc - Improve fallback mode, to work better with refcnt based recycling e.g. remove a WARN as pointed out by Tariq e.g. quicker fallback if ptr_ring is empty. 
V5: Fixed SPDX license as pointed out by Alexei V6: Adjustments requested by Eric Dumazet - Adjust ____cacheline_aligned_in_smp usage/placement - Move rcu_head in struct page_pool - Free pages quicker on destroy, minimize resources delayed an RCU period - Remove code for forward/backward compat ABI interface V8: Issues found by kbuild test robot - Address sparse should be static warnings - Only compile+link when a driver use/select page_pool, mlx5 selects CONFIG_PAGE_POOL, although its first used in two patches Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/Kconfig | 1 + include/net/page_pool.h | 129 ++++++++++ net/Kconfig | 3 + net/core/Makefile | 1 + net/core/page_pool.c | 317 ++++++++++++++++++++++++ 5 files changed, 451 insertions(+) create mode 100644 include/net/page_pool.h create mode 100644 net/core/page_pool.c (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig index c032319f1cb9..12257034131e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig +++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig @@ -30,6 +30,7 @@ config MLX5_CORE_EN bool "Mellanox Technologies ConnectX-4 Ethernet support" depends on NETDEVICES && ETHERNET && INET && PCI && MLX5_CORE depends on IPV6=y || IPV6=n || MLX5_CORE=m + select PAGE_POOL default n ---help--- Ethernet support in Mellanox Technologies ConnectX-4 NIC. diff --git a/include/net/page_pool.h b/include/net/page_pool.h new file mode 100644 index 000000000000..1fe77db59518 --- /dev/null +++ b/include/net/page_pool.h @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * page_pool.h + * Author: Jesper Dangaard Brouer + * Copyright (C) 2016 Red Hat, Inc. + */ + +/** + * DOC: page_pool allocator + * + * This page_pool allocator is optimized for the XDP mode that + * uses one-frame-per-page, but have fallbacks that act like the + * regular page allocator APIs. + * + * Basic use involve replacing alloc_pages() calls with the + * page_pool_alloc_pages() call. Drivers should likely use + * page_pool_dev_alloc_pages() replacing dev_alloc_pages(). + * + * If page_pool handles DMA mapping (use page->private), then API user + * is responsible for invoking page_pool_put_page() once. In-case of + * elevated refcnt, the DMA state is released, assuming other users of + * the page will eventually call put_page(). + * + * If no DMA mapping is done, then it can act as shim-layer that + * fall-through to alloc_page. As no state is kept on the page, the + * regular put_page() call is sufficient. + */ +#ifndef _NET_PAGE_POOL_H +#define _NET_PAGE_POOL_H + +#include /* Needed by ptr_ring */ +#include +#include + +#define PP_FLAG_DMA_MAP 1 /* Should page_pool do the DMA map/unmap */ +#define PP_FLAG_ALL PP_FLAG_DMA_MAP + +/* + * Fast allocation side cache array/stack + * + * The cache size and refill watermark is related to the network + * use-case. The NAPI budget is 64 packets. After a NAPI poll the RX + * ring is usually refilled and the max consumed elements will be 64, + * thus a natural max size of objects needed in the cache. + * + * Keeping room for more objects, is due to XDP_DROP use-case. As + * XDP_DROP allows the opportunity to recycle objects directly into + * this array, as it shares the same softirq/NAPI protection. If + * cache is already full (or partly full) then the XDP_DROP recycles + * would have to take a slower code path. 
+ */ +#define PP_ALLOC_CACHE_SIZE 128 +#define PP_ALLOC_CACHE_REFILL 64 +struct pp_alloc_cache { + u32 count; + void *cache[PP_ALLOC_CACHE_SIZE]; +}; + +struct page_pool_params { + unsigned int flags; + unsigned int order; + unsigned int pool_size; + int nid; /* Numa node id to allocate from pages from */ + struct device *dev; /* device, for DMA pre-mapping purposes */ + enum dma_data_direction dma_dir; /* DMA mapping direction */ +}; + +struct page_pool { + struct rcu_head rcu; + struct page_pool_params p; + + /* + * Data structure for allocation side + * + * Drivers allocation side usually already perform some kind + * of resource protection. Piggyback on this protection, and + * require driver to protect allocation side. + * + * For NIC drivers this means, allocate a page_pool per + * RX-queue. As the RX-queue is already protected by + * Softirq/BH scheduling and napi_schedule. NAPI schedule + * guarantee that a single napi_struct will only be scheduled + * on a single CPU (see napi_schedule). + */ + struct pp_alloc_cache alloc ____cacheline_aligned_in_smp; + + /* Data structure for storing recycled pages. + * + * Returning/freeing pages is more complicated synchronization + * wise, because free's can happen on remote CPUs, with no + * association with allocation resource. + * + * Use ptr_ring, as it separates consumer and producer + * effeciently, it a way that doesn't bounce cache-lines. + * + * TODO: Implement bulk return pages into this structure. + */ + struct ptr_ring ring; +}; + +struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp); + +static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool) +{ + gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); + + return page_pool_alloc_pages(pool, gfp); +} + +struct page_pool *page_pool_create(const struct page_pool_params *params); + +void page_pool_destroy(struct page_pool *pool); + +/* Never call this directly, use helpers below */ +void __page_pool_put_page(struct page_pool *pool, + struct page *page, bool allow_direct); + +static inline void page_pool_put_page(struct page_pool *pool, struct page *page) +{ + __page_pool_put_page(pool, page, false); +} +/* Very limited use-cases allow recycle direct */ +static inline void page_pool_recycle_direct(struct page_pool *pool, + struct page *page) +{ + __page_pool_put_page(pool, page, true); +} + +#endif /* _NET_PAGE_POOL_H */ diff --git a/net/Kconfig b/net/Kconfig index 0428f12c25c2..6fa1a4493b8c 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -423,6 +423,9 @@ config MAY_USE_DEVLINK on MAY_USE_DEVLINK to ensure they do not cause link errors when devlink is a loadable module and the driver using it is built-in. +config PAGE_POOL + bool + endif # if NET # Used by archs to tell that they support BPF JIT compiler plus which flavour. diff --git a/net/core/Makefile b/net/core/Makefile index 6dbbba8c57ae..7080417f8bc8 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -14,6 +14,7 @@ obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ fib_notifier.o xdp.o obj-y += net-sysfs.o +obj-$(CONFIG_PAGE_POOL) += page_pool.o obj-$(CONFIG_PROC_FS) += net-procfs.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_NETPOLL) += netpoll.o diff --git a/net/core/page_pool.c b/net/core/page_pool.c new file mode 100644 index 000000000000..68bf07206744 --- /dev/null +++ b/net/core/page_pool.c @@ -0,0 +1,317 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * page_pool.c + * Author: Jesper Dangaard Brouer + * Copyright (C) 2016 Red Hat, Inc. 
+ */ +#include +#include +#include + +#include +#include +#include +#include +#include /* for __put_page() */ + +static int page_pool_init(struct page_pool *pool, + const struct page_pool_params *params) +{ + unsigned int ring_qsize = 1024; /* Default */ + + memcpy(&pool->p, params, sizeof(pool->p)); + + /* Validate only known flags were used */ + if (pool->p.flags & ~(PP_FLAG_ALL)) + return -EINVAL; + + if (pool->p.pool_size) + ring_qsize = pool->p.pool_size; + + /* Sanity limit mem that can be pinned down */ + if (ring_qsize > 32768) + return -E2BIG; + + /* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL. + * DMA_BIDIRECTIONAL is for allowing page used for DMA sending, + * which is the XDP_TX use-case. + */ + if ((pool->p.dma_dir != DMA_FROM_DEVICE) && + (pool->p.dma_dir != DMA_BIDIRECTIONAL)) + return -EINVAL; + + if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) + return -ENOMEM; + + return 0; +} + +struct page_pool *page_pool_create(const struct page_pool_params *params) +{ + struct page_pool *pool; + int err = 0; + + pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid); + if (!pool) + return ERR_PTR(-ENOMEM); + + err = page_pool_init(pool, params); + if (err < 0) { + pr_warn("%s() gave up with errno %d\n", __func__, err); + kfree(pool); + return ERR_PTR(err); + } + return pool; +} +EXPORT_SYMBOL(page_pool_create); + +/* fast path */ +static struct page *__page_pool_get_cached(struct page_pool *pool) +{ + struct ptr_ring *r = &pool->ring; + struct page *page; + + /* Quicker fallback, avoid locks when ring is empty */ + if (__ptr_ring_empty(r)) + return NULL; + + /* Test for safe-context, caller should provide this guarantee */ + if (likely(in_serving_softirq())) { + if (likely(pool->alloc.count)) { + /* Fast-path */ + page = pool->alloc.cache[--pool->alloc.count]; + return page; + } + /* Slower-path: Alloc array empty, time to refill + * + * Open-coded bulk ptr_ring consumer. + * + * Discussion: the ring consumer lock is not really + * needed due to the softirq/NAPI protection, but + * later need the ability to reclaim pages on the + * ring. Thus, keeping the locks. + */ + spin_lock(&r->consumer_lock); + while ((page = __ptr_ring_consume(r))) { + if (pool->alloc.count == PP_ALLOC_CACHE_REFILL) + break; + pool->alloc.cache[pool->alloc.count++] = page; + } + spin_unlock(&r->consumer_lock); + return page; + } + + /* Slow-path: Get page from locked ring queue */ + page = ptr_ring_consume(&pool->ring); + return page; +} + +/* slow path */ +noinline +static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, + gfp_t _gfp) +{ + struct page *page; + gfp_t gfp = _gfp; + dma_addr_t dma; + + /* We could always set __GFP_COMP, and avoid this branch, as + * prep_new_page() can handle order-0 with __GFP_COMP. + */ + if (pool->p.order) + gfp |= __GFP_COMP; + + /* FUTURE development: + * + * Current slow-path essentially falls back to single page + * allocations, which doesn't improve performance. This code + * need bulk allocation support from the page allocator code. + */ + + /* Cache was empty, do real allocation */ + page = alloc_pages_node(pool->p.nid, gfp, pool->p.order); + if (!page) + return NULL; + + if (!(pool->p.flags & PP_FLAG_DMA_MAP)) + goto skip_dma_map; + + /* Setup DMA mapping: use page->private for DMA-addr + * This mapping is kept for lifetime of page, until leaving pool. 
+ */ + dma = dma_map_page(pool->p.dev, page, 0, + (PAGE_SIZE << pool->p.order), + pool->p.dma_dir); + if (dma_mapping_error(pool->p.dev, dma)) { + put_page(page); + return NULL; + } + set_page_private(page, dma); /* page->private = dma; */ + +skip_dma_map: + /* When page just alloc'ed is should/must have refcnt 1. */ + return page; +} + +/* For using page_pool replace: alloc_pages() API calls, but provide + * synchronization guarantee for allocation side. + */ +struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp) +{ + struct page *page; + + /* Fast-path: Get a page from cache */ + page = __page_pool_get_cached(pool); + if (page) + return page; + + /* Slow-path: cache empty, do real allocation */ + page = __page_pool_alloc_pages_slow(pool, gfp); + return page; +} +EXPORT_SYMBOL(page_pool_alloc_pages); + +/* Cleanup page_pool state from page */ +static void __page_pool_clean_page(struct page_pool *pool, + struct page *page) +{ + if (!(pool->p.flags & PP_FLAG_DMA_MAP)) + return; + + /* DMA unmap */ + dma_unmap_page(pool->p.dev, page_private(page), + PAGE_SIZE << pool->p.order, pool->p.dma_dir); + set_page_private(page, 0); +} + +/* Return a page to the page allocator, cleaning up our state */ +static void __page_pool_return_page(struct page_pool *pool, struct page *page) +{ + __page_pool_clean_page(pool, page); + put_page(page); + /* An optimization would be to call __free_pages(page, pool->p.order) + * knowing page is not part of page-cache (thus avoiding a + * __page_cache_release() call). + */ +} + +static bool __page_pool_recycle_into_ring(struct page_pool *pool, + struct page *page) +{ + int ret; + /* BH protection not needed if current is serving softirq */ + if (in_serving_softirq()) + ret = ptr_ring_produce(&pool->ring, page); + else + ret = ptr_ring_produce_bh(&pool->ring, page); + + return (ret == 0) ? true : false; +} + +/* Only allow direct recycling in special circumstances, into the + * alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case. + * + * Caller must provide appropriate safe context. + */ +static bool __page_pool_recycle_direct(struct page *page, + struct page_pool *pool) +{ + if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) + return false; + + /* Caller MUST have verified/know (page_ref_count(page) == 1) */ + pool->alloc.cache[pool->alloc.count++] = page; + return true; +} + +void __page_pool_put_page(struct page_pool *pool, + struct page *page, bool allow_direct) +{ + /* This allocator is optimized for the XDP mode that uses + * one-frame-per-page, but have fallbacks that act like the + * regular page allocator APIs. + * + * refcnt == 1 means page_pool owns page, and can recycle it. + */ + if (likely(page_ref_count(page) == 1)) { + /* Read barrier done in page_ref_count / READ_ONCE */ + + if (allow_direct && in_serving_softirq()) + if (__page_pool_recycle_direct(page, pool)) + return; + + if (!__page_pool_recycle_into_ring(pool, page)) { + /* Cache full, fallback to free pages */ + __page_pool_return_page(pool, page); + } + return; + } + /* Fallback/non-XDP mode: API user have elevated refcnt. + * + * Many drivers split up the page into fragments, and some + * want to keep doing this to save memory and do refcnt based + * recycling. Support this use case too, to ease drivers + * switching between XDP/non-XDP. + * + * In-case page_pool maintains the DMA mapping, API user must + * call page_pool_put_page once. 
In this elevated refcnt + * case, the DMA is unmapped/released, as driver is likely + * doing refcnt based recycle tricks, meaning another process + * will be invoking put_page. + */ + __page_pool_clean_page(pool, page); + put_page(page); +} +EXPORT_SYMBOL(__page_pool_put_page); + +static void __page_pool_empty_ring(struct page_pool *pool) +{ + struct page *page; + + /* Empty recycle ring */ + while ((page = ptr_ring_consume(&pool->ring))) { + /* Verify the refcnt invariant of cached pages */ + if (!(page_ref_count(page) == 1)) + pr_crit("%s() page_pool refcnt %d violation\n", + __func__, page_ref_count(page)); + + __page_pool_return_page(pool, page); + } +} + +static void __page_pool_destroy_rcu(struct rcu_head *rcu) +{ + struct page_pool *pool; + + pool = container_of(rcu, struct page_pool, rcu); + + WARN(pool->alloc.count, "API usage violation"); + + __page_pool_empty_ring(pool); + ptr_ring_cleanup(&pool->ring, NULL); + kfree(pool); +} + +/* Cleanup and release resources */ +void page_pool_destroy(struct page_pool *pool) +{ + struct page *page; + + /* Empty alloc cache, assume caller made sure this is + * no-longer in use, and page_pool_alloc_pages() cannot be + * call concurrently. + */ + while (pool->alloc.count) { + page = pool->alloc.cache[--pool->alloc.count]; + __page_pool_return_page(pool, page); + } + + /* No more consumers should exist, but producers could still + * be in-flight. + */ + __page_pool_empty_ring(pool); + + /* An xdp_mem_allocator can still ref page_pool pointer */ + call_rcu(&pool->rcu, __page_pool_destroy_rcu); +} +EXPORT_SYMBOL(page_pool_destroy); -- cgit v1.2.3 From 57d0a1c1ac9e6a836bbab4698ba2a2e03f64bf1b Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:46:22 +0200 Subject: xdp: allow page_pool as an allocator type in xdp_return_frame New allocator type MEM_TYPE_PAGE_POOL for page_pool usage. The registered allocator page_pool pointer is not available directly from xdp_rxq_info, but it could be (if needed). For now, the driver should keep separate track of the page_pool pointer, which it should use for RX-ring page allocation. As suggested by Saeed, to maintain a symmetric API it is the drivers responsibility to allocate/create and free/destroy the page_pool. Thus, after the driver have called xdp_rxq_info_unreg(), it is drivers responsibility to free the page_pool, but with a RCU free call. This is done easily via the page_pool helper page_pool_destroy() (which avoids touching any driver code during the RCU callback, which could happen after the driver have been unloaded). V8: address issues found by kbuild test robot - Address sparse should be static warnings - Allow xdp.o to be compiled without page_pool.o V9: Remove inline from .c file, compiler knows best Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/net/page_pool.h | 14 ++++++++++++ include/net/xdp.h | 3 +++ net/core/xdp.c | 60 +++++++++++++++++++++++++++++++++++++++---------- 3 files changed, 65 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 1fe77db59518..c79087153148 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -117,7 +117,12 @@ void __page_pool_put_page(struct page_pool *pool, static inline void page_pool_put_page(struct page_pool *pool, struct page *page) { + /* When page_pool isn't compiled-in, net/core/xdp.c doesn't + * allow registering MEM_TYPE_PAGE_POOL, but shield linker. 
+ */ +#ifdef CONFIG_PAGE_POOL __page_pool_put_page(pool, page, false); +#endif } /* Very limited use-cases allow recycle direct */ static inline void page_pool_recycle_direct(struct page_pool *pool, @@ -126,4 +131,13 @@ static inline void page_pool_recycle_direct(struct page_pool *pool, __page_pool_put_page(pool, page, true); } +static inline bool is_page_pool_compiled_in(void) +{ +#ifdef CONFIG_PAGE_POOL + return true; +#else + return false; +#endif +} + #endif /* _NET_PAGE_POOL_H */ diff --git a/include/net/xdp.h b/include/net/xdp.h index 5f67c62540aa..d0ee437753dc 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -36,6 +36,7 @@ enum xdp_mem_type { MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */ MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */ + MEM_TYPE_PAGE_POOL, MEM_TYPE_MAX, }; @@ -44,6 +45,8 @@ struct xdp_mem_info { u32 id; }; +struct page_pool; + struct xdp_rxq_info { struct net_device *dev; u32 queue_index; diff --git a/net/core/xdp.c b/net/core/xdp.c index 8b2cb79b5de0..33e382afbd95 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -8,6 +8,7 @@ #include #include #include +#include #include @@ -27,7 +28,10 @@ static struct rhashtable *mem_id_ht; struct xdp_mem_allocator { struct xdp_mem_info mem; - void *allocator; + union { + void *allocator; + struct page_pool *page_pool; + }; struct rhash_head node; struct rcu_head rcu; }; @@ -74,7 +78,9 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu) /* Allow this ID to be reused */ ida_simple_remove(&mem_id_pool, xa->mem.id); - /* TODO: Depending on allocator type/pointer free resources */ + /* Notice, driver is expected to free the *allocator, + * e.g. page_pool, and MUST also use RCU free. + */ /* Poison memory */ xa->mem.id = 0xFFFF; @@ -225,6 +231,17 @@ again: return id; } +static bool __is_supported_mem_type(enum xdp_mem_type type) +{ + if (type == MEM_TYPE_PAGE_POOL) + return is_page_pool_compiled_in(); + + if (type >= MEM_TYPE_MAX) + return false; + + return true; +} + int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, enum xdp_mem_type type, void *allocator) { @@ -238,13 +255,16 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, return -EFAULT; } - if (type >= MEM_TYPE_MAX) - return -EINVAL; + if (!__is_supported_mem_type(type)) + return -EOPNOTSUPP; xdp_rxq->mem.type = type; - if (!allocator) + if (!allocator) { + if (type == MEM_TYPE_PAGE_POOL) + return -EINVAL; /* Setup time check page_pool req */ return 0; + } /* Delay init of rhashtable to save memory if feature isn't used */ if (!mem_id_init) { @@ -290,15 +310,31 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); void xdp_return_frame(void *data, struct xdp_mem_info *mem) { - if (mem->type == MEM_TYPE_PAGE_SHARED) { + struct xdp_mem_allocator *xa; + struct page *page; + + switch (mem->type) { + case MEM_TYPE_PAGE_POOL: + rcu_read_lock(); + /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ + xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); + page = virt_to_head_page(data); + if (xa) + page_pool_put_page(xa->page_pool, page); + else + put_page(page); + rcu_read_unlock(); + break; + case MEM_TYPE_PAGE_SHARED: page_frag_free(data); - return; - } - - if (mem->type == MEM_TYPE_PAGE_ORDER0) { - struct page *page = virt_to_page(data); /* Assumes order0 page*/ - + break; + case MEM_TYPE_PAGE_ORDER0: + page = virt_to_page(data); /* Assumes order0 page*/ put_page(page); + break; + default: + /* Not possible, checked in xdp_rxq_info_reg_mem_model() */ + break; } } 
EXPORT_SYMBOL_GPL(xdp_return_frame); -- cgit v1.2.3 From 039930945a72d9af5ff04ae9b9e60658a52e0770 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:46:32 +0200 Subject: xdp: transition into using xdp_frame for return API Changing the xdp_return_frame() API to take a struct xdp_frame as argument seems like a natural choice, but there are some subtle performance details here that need extra care. De-referencing the xdp_frame on a remote CPU during DMA-TX completion moves its cache-line into the "Shared" state. Later, when the page is reused for RX, this xdp_frame cache-line is written, which changes the state to "Modified". This situation already happens (naturally) for virtio_net, tun and cpumap, as the xdp_frame pointer is the queued object. In tun and cpumap, the ptr_ring is used for efficiently transferring cache-lines (with pointers) between CPUs. Thus, the only option is to de-reference the xdp_frame. Only the ixgbe driver had an optimization that could avoid de-referencing the xdp_frame: the driver already has a TX-ring queue, which (in case of remote DMA-TX completion) has to be transferred between CPUs anyhow, and in this data area we stored a struct xdp_mem_info and a data pointer, which allowed us to avoid touching the xdp_frame. To compensate for losing this, a prefetchw is used to tell the cache coherency protocol about our access pattern. My benchmarks show that this prefetchw is enough to compensate in the ixgbe driver. V7: Adjust for commit d9314c474d4f ("i40e: add support for XDP_REDIRECT") V8: Adjust for commit bd658dda4237 ("net/mlx5e: Separate dma base address and offset in dma_sync call") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S.
Miller --- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 5 ++--- drivers/net/ethernet/intel/ixgbe/ixgbe.h | 4 +--- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 17 +++++++++++------ drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 1 + drivers/net/tun.c | 4 ++-- drivers/net/virtio_net.c | 2 +- include/net/xdp.h | 2 +- kernel/bpf/cpumap.c | 6 +++--- net/core/xdp.c | 4 +++- 9 files changed, 25 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 96c54cbfb1f9..c8bf4d35fdea 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -638,8 +638,7 @@ static void i40e_unmap_and_free_tx_resource(struct i40e_ring *ring, if (tx_buffer->tx_flags & I40E_TX_FLAGS_FD_SB) kfree(tx_buffer->raw_buf); else if (ring_is_xdp(ring)) - xdp_return_frame(tx_buffer->xdpf->data, - &tx_buffer->xdpf->mem); + xdp_return_frame(tx_buffer->xdpf); else dev_kfree_skb_any(tx_buffer->skb); if (dma_unmap_len(tx_buffer, len)) @@ -842,7 +841,7 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi, /* free the skb/XDP data */ if (ring_is_xdp(tx_ring)) - xdp_return_frame(tx_buf->xdpf->data, &tx_buf->xdpf->mem); + xdp_return_frame(tx_buf->xdpf); else napi_consume_skb(tx_buf->skb, napi_budget); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h index abb5248e917e..7dd5038cfcc4 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h @@ -241,8 +241,7 @@ struct ixgbe_tx_buffer { unsigned long time_stamp; union { struct sk_buff *skb; - /* XDP uses address ptr on irq_clean */ - void *data; + struct xdp_frame *xdpf; }; unsigned int bytecount; unsigned short gso_segs; @@ -250,7 +249,6 @@ struct ixgbe_tx_buffer { DEFINE_DMA_UNMAP_ADDR(dma); DEFINE_DMA_UNMAP_LEN(len); u32 tx_flags; - struct xdp_mem_info xdp_mem; }; struct ixgbe_rx_buffer { diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index f10904ec2172..4f2864165723 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -1216,7 +1216,7 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector, /* free the skb */ if (ring_is_xdp(tx_ring)) - xdp_return_frame(tx_buffer->data, &tx_buffer->xdp_mem); + xdp_return_frame(tx_buffer->xdpf); else napi_consume_skb(tx_buffer->skb, napi_budget); @@ -2386,6 +2386,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, xdp.data_hard_start = xdp.data - ixgbe_rx_offset(rx_ring); xdp.data_end = xdp.data + size; + prefetchw(xdp.data_hard_start); /* xdp_frame write */ skb = ixgbe_run_xdp(adapter, rx_ring, &xdp); } @@ -5797,7 +5798,7 @@ static void ixgbe_clean_tx_ring(struct ixgbe_ring *tx_ring) /* Free all the Tx ring sk_buffs */ if (ring_is_xdp(tx_ring)) - xdp_return_frame(tx_buffer->data, &tx_buffer->xdp_mem); + xdp_return_frame(tx_buffer->xdpf); else dev_kfree_skb_any(tx_buffer->skb); @@ -8348,16 +8349,21 @@ static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter, struct ixgbe_ring *ring = adapter->xdp_ring[smp_processor_id()]; struct ixgbe_tx_buffer *tx_buffer; union ixgbe_adv_tx_desc *tx_desc; + struct xdp_frame *xdpf; u32 len, cmd_type; dma_addr_t dma; u16 i; - len = xdp->data_end - xdp->data; + xdpf = convert_to_xdp_frame(xdp); + if (unlikely(!xdpf)) + return -EOVERFLOW; + + len = xdpf->len; if (unlikely(!ixgbe_desc_unused(ring))) return IXGBE_XDP_CONSUMED; - dma 
= dma_map_single(ring->dev, xdp->data, len, DMA_TO_DEVICE); + dma = dma_map_single(ring->dev, xdpf->data, len, DMA_TO_DEVICE); if (dma_mapping_error(ring->dev, dma)) return IXGBE_XDP_CONSUMED; @@ -8372,8 +8378,7 @@ static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter, dma_unmap_len_set(tx_buffer, len, len); dma_unmap_addr_set(tx_buffer, dma, dma); - tx_buffer->data = xdp->data; - tx_buffer->xdp_mem = xdp->rxq->mem; + tx_buffer->xdpf = xdpf; tx_desc->read.buffer_addr = cpu_to_le64(dma); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index f42436d7f2d9..7bbf0db27a01 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -890,6 +890,7 @@ struct sk_buff *skb_from_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, dma_sync_single_range_for_cpu(rq->pdev, di->addr, wi->offset, frag_size, DMA_FROM_DEVICE); + prefetchw(va); /* xdp_frame data area */ prefetch(data); wi->offset += frag_size; diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 283bde85c455..bec130cdbd9d 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -663,7 +663,7 @@ void tun_ptr_free(void *ptr) if (tun_is_xdp_frame(ptr)) { struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); - xdp_return_frame(xdpf->data, &xdpf->mem); + xdp_return_frame(xdpf); } else { __skb_array_destroy_skb(ptr); } @@ -2196,7 +2196,7 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile, struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); ret = tun_put_user_xdp(tun, tfile, xdpf, to); - xdp_return_frame(xdpf->data, &xdpf->mem); + xdp_return_frame(xdpf); } else { struct sk_buff *skb = ptr; diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 42d338fe9a8d..ab3d7cbc4c49 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -430,7 +430,7 @@ static int __virtnet_xdp_xmit(struct virtnet_info *vi, /* Free up any pending old buffers before queueing new ones. 
*/ while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) - xdp_return_frame(xdpf_sent->data, &xdpf_sent->mem); + xdp_return_frame(xdpf_sent); xdpf = convert_to_xdp_frame(xdp); if (unlikely(!xdpf)) diff --git a/include/net/xdp.h b/include/net/xdp.h index d0ee437753dc..137ad5f9f40f 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -103,7 +103,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) return xdp_frame; } -void xdp_return_frame(void *data, struct xdp_mem_info *mem); +void xdp_return_frame(struct xdp_frame *xdpf); int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index); diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index bcdc4dea5ce7..c95b04ec103e 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -219,7 +219,7 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring) while ((xdpf = ptr_ring_consume(ring))) if (WARN_ON_ONCE(xdpf)) - xdp_return_frame(xdpf->data, &xdpf->mem); + xdp_return_frame(xdpf); } static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) @@ -275,7 +275,7 @@ static int cpu_map_kthread_run(void *data) skb = cpu_map_build_skb(rcpu, xdpf); if (!skb) { - xdp_return_frame(xdpf->data, &xdpf->mem); + xdp_return_frame(xdpf); continue; } @@ -578,7 +578,7 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, err = __ptr_ring_produce(q, xdpf); if (err) { drops++; - xdp_return_frame(xdpf->data, &xdpf->mem); + xdp_return_frame(xdpf); } processed++; } diff --git a/net/core/xdp.c b/net/core/xdp.c index 33e382afbd95..0c86b53a3a63 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -308,9 +308,11 @@ err: } EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); -void xdp_return_frame(void *data, struct xdp_mem_info *mem) +void xdp_return_frame(struct xdp_frame *xdpf) { + struct xdp_mem_info *mem = &xdpf->mem; struct xdp_mem_allocator *xa; + void *data = xdpf->data; struct page *page; switch (mem->type) { -- cgit v1.2.3 From 44fa2dbd475996ddc8f3a0e6113dee983e0ee3aa Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:46:37 +0200 Subject: xdp: transition into using xdp_frame for ndo_xdp_xmit Change the ndo_xdp_xmit API to take a struct xdp_frame instead of a struct xdp_buff. This brings xdp_return_frame and ndo_xdp_xmit in sync. This builds towards changing the API further to become a bulk API, because xdp_buff is not a queue-able object while xdp_frame is. V4: Adjust for commit 59655a5b6c83 ("tuntap: XDP_TX can use native XDP") V7: Adjust for commit d9314c474d4f ("i40e: add support for XDP_REDIRECT") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S.
Miller --- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 30 +++++++++++++++------------ drivers/net/ethernet/intel/i40e/i40e_txrx.h | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 24 +++++++++++---------- drivers/net/tun.c | 19 ++++++++++------- drivers/net/virtio_net.c | 24 ++++++++++++--------- include/linux/netdevice.h | 4 ++-- net/core/filter.c | 17 +++++++++++++-- 7 files changed, 74 insertions(+), 46 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index c8bf4d35fdea..87fb27ab9c24 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -2203,9 +2203,20 @@ static bool i40e_is_non_eop(struct i40e_ring *rx_ring, #define I40E_XDP_CONSUMED 1 #define I40E_XDP_TX 2 -static int i40e_xmit_xdp_ring(struct xdp_buff *xdp, +static int i40e_xmit_xdp_ring(struct xdp_frame *xdpf, struct i40e_ring *xdp_ring); +static int i40e_xmit_xdp_tx_ring(struct xdp_buff *xdp, + struct i40e_ring *xdp_ring) +{ + struct xdp_frame *xdpf = convert_to_xdp_frame(xdp); + + if (unlikely(!xdpf)) + return I40E_XDP_CONSUMED; + + return i40e_xmit_xdp_ring(xdpf, xdp_ring); +} + /** * i40e_run_xdp - run an XDP program * @rx_ring: Rx ring being processed @@ -2233,7 +2244,7 @@ static struct sk_buff *i40e_run_xdp(struct i40e_ring *rx_ring, break; case XDP_TX: xdp_ring = rx_ring->vsi->xdp_rings[rx_ring->queue_index]; - result = i40e_xmit_xdp_ring(xdp, xdp_ring); + result = i40e_xmit_xdp_tx_ring(xdp, xdp_ring); break; case XDP_REDIRECT: err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog); @@ -3480,21 +3491,14 @@ dma_error: * @xdp: data to transmit * @xdp_ring: XDP Tx ring **/ -static int i40e_xmit_xdp_ring(struct xdp_buff *xdp, +static int i40e_xmit_xdp_ring(struct xdp_frame *xdpf, struct i40e_ring *xdp_ring) { u16 i = xdp_ring->next_to_use; struct i40e_tx_buffer *tx_bi; struct i40e_tx_desc *tx_desc; - struct xdp_frame *xdpf; + u32 size = xdpf->len; dma_addr_t dma; - u32 size; - - xdpf = convert_to_xdp_frame(xdp); - if (unlikely(!xdpf)) - return I40E_XDP_CONSUMED; - - size = xdpf->len; if (!unlikely(I40E_DESC_UNUSED(xdp_ring))) { xdp_ring->tx_stats.tx_busy++; @@ -3684,7 +3688,7 @@ netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev) * * Returns Zero if sent, else an error code **/ -int i40e_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) +int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf) { struct i40e_netdev_priv *np = netdev_priv(dev); unsigned int queue_index = smp_processor_id(); @@ -3697,7 +3701,7 @@ int i40e_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs) return -ENXIO; - err = i40e_xmit_xdp_ring(xdp, vsi->xdp_rings[queue_index]); + err = i40e_xmit_xdp_ring(xdpf, vsi->xdp_rings[queue_index]); if (err != I40E_XDP_TX) return -ENOSPC; diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h index 857b1d743c8d..4bf318b8be85 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h @@ -511,7 +511,7 @@ u32 i40e_get_tx_pending(struct i40e_ring *ring, bool in_sw); void i40e_detect_recover_hung(struct i40e_vsi *vsi); int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size); bool __i40e_chk_linearize(struct sk_buff *skb); -int i40e_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp); +int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf); void 
i40e_xdp_flush(struct net_device *dev); /** diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 4f2864165723..51e7d82a5860 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -2262,7 +2262,7 @@ static struct sk_buff *ixgbe_build_skb(struct ixgbe_ring *rx_ring, #define IXGBE_XDP_TX 2 static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter, - struct xdp_buff *xdp); + struct xdp_frame *xdpf); static struct sk_buff *ixgbe_run_xdp(struct ixgbe_adapter *adapter, struct ixgbe_ring *rx_ring, @@ -2270,6 +2270,7 @@ static struct sk_buff *ixgbe_run_xdp(struct ixgbe_adapter *adapter, { int err, result = IXGBE_XDP_PASS; struct bpf_prog *xdp_prog; + struct xdp_frame *xdpf; u32 act; rcu_read_lock(); @@ -2278,12 +2279,19 @@ static struct sk_buff *ixgbe_run_xdp(struct ixgbe_adapter *adapter, if (!xdp_prog) goto xdp_out; + prefetchw(xdp->data_hard_start); /* xdp_frame write */ + act = bpf_prog_run_xdp(xdp_prog, xdp); switch (act) { case XDP_PASS: break; case XDP_TX: - result = ixgbe_xmit_xdp_ring(adapter, xdp); + xdpf = convert_to_xdp_frame(xdp); + if (unlikely(!xdpf)) { + result = IXGBE_XDP_CONSUMED; + break; + } + result = ixgbe_xmit_xdp_ring(adapter, xdpf); break; case XDP_REDIRECT: err = xdp_do_redirect(adapter->netdev, xdp, xdp_prog); @@ -2386,7 +2394,6 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, xdp.data_hard_start = xdp.data - ixgbe_rx_offset(rx_ring); xdp.data_end = xdp.data + size; - prefetchw(xdp.data_hard_start); /* xdp_frame write */ skb = ixgbe_run_xdp(adapter, rx_ring, &xdp); } @@ -8344,20 +8351,15 @@ static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb, } static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter, - struct xdp_buff *xdp) + struct xdp_frame *xdpf) { struct ixgbe_ring *ring = adapter->xdp_ring[smp_processor_id()]; struct ixgbe_tx_buffer *tx_buffer; union ixgbe_adv_tx_desc *tx_desc; - struct xdp_frame *xdpf; u32 len, cmd_type; dma_addr_t dma; u16 i; - xdpf = convert_to_xdp_frame(xdp); - if (unlikely(!xdpf)) - return -EOVERFLOW; - len = xdpf->len; if (unlikely(!ixgbe_desc_unused(ring))) @@ -10010,7 +10012,7 @@ static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp) } } -static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) +static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf) { struct ixgbe_adapter *adapter = netdev_priv(dev); struct ixgbe_ring *ring; @@ -10026,7 +10028,7 @@ static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) if (unlikely(!ring)) return -ENXIO; - err = ixgbe_xmit_xdp_ring(adapter, xdp); + err = ixgbe_xmit_xdp_ring(adapter, xdpf); if (err != IXGBE_XDP_TX) return -ENOSPC; diff --git a/drivers/net/tun.c b/drivers/net/tun.c index bec130cdbd9d..1e58be152d5c 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1301,18 +1301,13 @@ static const struct net_device_ops tun_netdev_ops = { .ndo_get_stats64 = tun_net_get_stats64, }; -static int tun_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) +static int tun_xdp_xmit(struct net_device *dev, struct xdp_frame *frame) { struct tun_struct *tun = netdev_priv(dev); - struct xdp_frame *frame; struct tun_file *tfile; u32 numqueues; int ret = 0; - frame = convert_to_xdp_frame(xdp); - if (unlikely(!frame)) - return -EOVERFLOW; - rcu_read_lock(); numqueues = READ_ONCE(tun->numqueues); @@ -1336,6 +1331,16 @@ out: return ret; } +static int tun_xdp_tx(struct net_device *dev, struct 
xdp_buff *xdp) +{ + struct xdp_frame *frame = convert_to_xdp_frame(xdp); + + if (unlikely(!frame)) + return -EOVERFLOW; + + return tun_xdp_xmit(dev, frame); +} + static void tun_xdp_flush(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); @@ -1683,7 +1688,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, case XDP_TX: get_page(alloc_frag->page); alloc_frag->offset += buflen; - if (tun_xdp_xmit(tun->dev, &xdp)) + if (tun_xdp_tx(tun->dev, &xdp)) goto err_redirect; tun_xdp_flush(tun->dev); rcu_read_unlock(); diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index ab3d7cbc4c49..01694e26f03e 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -416,10 +416,10 @@ static void virtnet_xdp_flush(struct net_device *dev) } static int __virtnet_xdp_xmit(struct virtnet_info *vi, - struct xdp_buff *xdp) + struct xdp_frame *xdpf) { struct virtio_net_hdr_mrg_rxbuf *hdr; - struct xdp_frame *xdpf, *xdpf_sent; + struct xdp_frame *xdpf_sent; struct send_queue *sq; unsigned int len; unsigned int qp; @@ -432,10 +432,6 @@ static int __virtnet_xdp_xmit(struct virtnet_info *vi, while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) xdp_return_frame(xdpf_sent); - xdpf = convert_to_xdp_frame(xdp); - if (unlikely(!xdpf)) - return -EOVERFLOW; - /* virtqueue want to use data area in-front of packet */ if (unlikely(xdpf->metasize > 0)) return -EOPNOTSUPP; @@ -459,7 +455,7 @@ static int __virtnet_xdp_xmit(struct virtnet_info *vi, return 0; } -static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) +static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf) { struct virtnet_info *vi = netdev_priv(dev); struct receive_queue *rq = vi->rq; @@ -472,7 +468,7 @@ static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) if (!xdp_prog) return -ENXIO; - return __virtnet_xdp_xmit(vi, xdp); + return __virtnet_xdp_xmit(vi, xdpf); } static unsigned int virtnet_get_headroom(struct virtnet_info *vi) @@ -569,6 +565,7 @@ static struct sk_buff *receive_small(struct net_device *dev, xdp_prog = rcu_dereference(rq->xdp_prog); if (xdp_prog) { struct virtio_net_hdr_mrg_rxbuf *hdr = buf + header_offset; + struct xdp_frame *xdpf; struct xdp_buff xdp; void *orig_data; u32 act; @@ -611,7 +608,10 @@ static struct sk_buff *receive_small(struct net_device *dev, delta = orig_data - xdp.data; break; case XDP_TX: - err = __virtnet_xdp_xmit(vi, &xdp); + xdpf = convert_to_xdp_frame(&xdp); + if (unlikely(!xdpf)) + goto err_xdp; + err = __virtnet_xdp_xmit(vi, xdpf); if (unlikely(err)) { trace_xdp_exception(vi->dev, xdp_prog, act); goto err_xdp; @@ -702,6 +702,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, rcu_read_lock(); xdp_prog = rcu_dereference(rq->xdp_prog); if (xdp_prog) { + struct xdp_frame *xdpf; struct page *xdp_page; struct xdp_buff xdp; void *data; @@ -766,7 +767,10 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, } break; case XDP_TX: - err = __virtnet_xdp_xmit(vi, &xdp); + xdpf = convert_to_xdp_frame(&xdp); + if (unlikely(!xdpf)) + goto err_xdp; + err = __virtnet_xdp_xmit(vi, xdpf); if (unlikely(err)) { trace_xdp_exception(vi->dev, xdp_prog, act); if (unlikely(xdp_page != page)) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cf44503ea81a..14e0777ffcfb 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1165,7 +1165,7 @@ struct dev_ifalias { * This function is used to set or query state related to XDP on the * netdevice and manage BPF 
offload. See definition of * enum bpf_netdev_command for details. - * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_buff *xdp); + * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_frame *xdp); * This function is used to submit a XDP packet for transmit on a * netdevice. * void (*ndo_xdp_flush)(struct net_device *dev); @@ -1356,7 +1356,7 @@ struct net_device_ops { int (*ndo_bpf)(struct net_device *dev, struct netdev_bpf *bpf); int (*ndo_xdp_xmit)(struct net_device *dev, - struct xdp_buff *xdp); + struct xdp_frame *xdp); void (*ndo_xdp_flush)(struct net_device *dev); }; diff --git a/net/core/filter.c b/net/core/filter.c index d31aff93270d..3bb0cb98a9be 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2749,13 +2749,18 @@ static int __bpf_tx_xdp(struct net_device *dev, struct xdp_buff *xdp, u32 index) { + struct xdp_frame *xdpf; int err; if (!dev->netdev_ops->ndo_xdp_xmit) { return -EOPNOTSUPP; } - err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp); + xdpf = convert_to_xdp_frame(xdp); + if (unlikely(!xdpf)) + return -EOVERFLOW; + + err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf); if (err) return err; dev->netdev_ops->ndo_xdp_flush(dev); @@ -2771,11 +2776,19 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, if (map->map_type == BPF_MAP_TYPE_DEVMAP) { struct net_device *dev = fwd; + struct xdp_frame *xdpf; if (!dev->netdev_ops->ndo_xdp_xmit) return -EOPNOTSUPP; - err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp); + xdpf = convert_to_xdp_frame(xdp); + if (unlikely(!xdpf)) + return -EOVERFLOW; + + /* TODO: move to inside map code instead, for bulk support + * err = dev_map_enqueue(dev, xdp); + */ + err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf); if (err) return err; __dev_map_insert_ctx(map, index); -- cgit v1.2.3 From 45f8cb57da0d7a9ead4b39d7f5def333a5b0c08b Mon Sep 17 00:00:00 2001 From: Liam Girdwood Date: Tue, 27 Mar 2018 14:30:40 +0100 Subject: ASoC: core: Allow topology to override machine driver FE DAI link config. Machine drivers statically define a number of DAI links that currently cannot be changed or removed by topology. This means PCMs and platform components cannot be changed by topology at runtime, and machine drivers are tightly coupled to topology. This patch allows topology to override the machine driver DAI link config in order to reuse machine drivers with different topologies and platform components. The patch supports: 1) creating new FE PCMs with a topology-defined PCM ID, 2) destroying existing static FE PCMs, 3) changing the platform component driver, and 4) assigning any new HW params fixups. The patch requires no changes to the machine drivers, but does add some platform component flags that the platform component driver can assign before loading topologies.
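A minimal sketch of how a platform component driver might set these flags before loading its topology; the driver name, the empty fixup body and the numbering base are hypothetical, only the struct fields come from this patch:

static int example_be_hw_params_fixup(struct snd_soc_pcm_runtime *rtd,
				      struct snd_pcm_hw_params *params)
{
	/* Hypothetical fixup; copied into each overridden BE DAI link and
	 * called from soc_dai_hw_params() (see the soc-pcm.c hunk below).
	 */
	return 0;
}

static struct snd_soc_platform_driver example_platform_drv = {
	/* Take over FE DAI links from this machine driver; matched
	 * against card->dev->driver->name in soc_check_tplg_fes().
	 */
	.ignore_machine		= "example-machine",
	.be_hw_params_fixup	= example_be_hw_params_fixup,
	/* FE PCM device numbers come from the topology DAI link id */
	.use_dai_pcm_id		= true,
	/* BE PCMs are numbered from be_pcm_base + rtd->num */
	.be_pcm_base		= 16,
};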
Signed-off-by: Liam Girdwood Signed-off-by: Mark Brown --- include/sound/soc.h | 10 +++++++ sound/soc/soc-core.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++-- sound/soc/soc-pcm.c | 12 ++++++++ 3 files changed, 98 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/sound/soc.h b/include/sound/soc.h index ad266d7e9553..fac4ff04fb7d 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -1012,6 +1012,13 @@ struct snd_soc_platform_driver { /* platform stream compress ops */ const struct snd_compr_ops *compr_ops; + + /* this platform uses topology and ignore machine driver FEs */ + const char *ignore_machine; + int (*be_hw_params_fixup)(struct snd_soc_pcm_runtime *rtd, + struct snd_pcm_hw_params *params); + bool use_dai_pcm_id; /* use the DAI link PCM ID as PCM device number */ + int be_pcm_base; /* base device ID for all BE PCMs */ }; struct snd_soc_dai_link_component { @@ -1118,6 +1125,9 @@ struct snd_soc_dai_link { /* pmdown_time is ignored at stop */ unsigned int ignore_pmdown_time:1; + /* Do not create a PCM for this DAI link (Backend link) */ + unsigned int ignore:1; + struct list_head list; /* DAI link list of the soc card */ struct snd_soc_dobj dobj; /* For topology */ }; diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index bf7ca32ab31f..8f01fe8296ff 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -1050,6 +1050,9 @@ static int soc_bind_dai_link(struct snd_soc_card *card, const char *platform_name; int i; + if (dai_link->ignore) + return 0; + dev_dbg(card->dev, "ASoC: binding %s\n", dai_link->name); if (soc_is_dai_link_bound(card, dai_link)) { @@ -1672,7 +1675,7 @@ static int soc_probe_link_dais(struct snd_soc_card *card, { struct snd_soc_dai_link *dai_link = rtd->dai_link; struct snd_soc_dai *cpu_dai = rtd->cpu_dai; - int i, ret; + int i, ret, num; dev_dbg(card->dev, "ASoC: probe %s dai link %d late %d\n", card->name, rtd->num, order); @@ -1718,9 +1721,23 @@ static int soc_probe_link_dais(struct snd_soc_card *card, soc_dpcm_debugfs_add(rtd); #endif + /* + * most drivers will register their PCMs using DAI link ordering but + * topology based drivers can use the DAI link id field to set PCM + * device number and then use rtd + a base offset of the BEs. + */ + if (rtd->platform->driver->use_dai_pcm_id) { + if (rtd->dai_link->no_pcm) + num = rtd->platform->driver->be_pcm_base + rtd->num; + else + num = rtd->dai_link->id; + } else { + num = rtd->num; + } + if (cpu_dai->driver->compress_new) { /*create compress_device"*/ - ret = cpu_dai->driver->compress_new(rtd, rtd->num); + ret = cpu_dai->driver->compress_new(rtd, num); if (ret < 0) { dev_err(card->dev, "ASoC: can't create compress %s\n", dai_link->stream_name); @@ -1730,7 +1747,7 @@ static int soc_probe_link_dais(struct snd_soc_card *card, if (!dai_link->params) { /* create the pcm */ - ret = soc_new_pcm(rtd, rtd->num); + ret = soc_new_pcm(rtd, num); if (ret < 0) { dev_err(card->dev, "ASoC: can't create pcm %s :%d\n", dai_link->stream_name, ret); @@ -2076,6 +2093,59 @@ int snd_soc_set_dmi_name(struct snd_soc_card *card, const char *flavour) EXPORT_SYMBOL_GPL(snd_soc_set_dmi_name); #endif /* CONFIG_DMI */ +static void soc_check_tplg_fes(struct snd_soc_card *card) +{ + struct snd_soc_platform *platform; + struct snd_soc_dai_link *dai_link; + int i; + + list_for_each_entry(platform, &platform_list, list) { + + /* does this platform override FEs ? */ + if (!platform->driver->ignore_machine) + continue; + + /* for this machine ? 
*/ + if (strcmp(platform->driver->ignore_machine, + card->dev->driver->name)) + continue; + + /* machine matches, so override the rtd data */ + for (i = 0; i < card->num_links; i++) { + + dai_link = &card->dai_link[i]; + + /* ignore this FE */ + if (dai_link->dynamic) { + dai_link->ignore = true; + continue; + } + + dev_info(card->dev, "info: override FE DAI link %s\n", + card->dai_link[i].name); + + /* override platform */ + dai_link->platform_name = platform->component.name; + dai_link->cpu_dai_name = platform->component.name; + + /* convert non BE into BE */ + dai_link->no_pcm = 1; + dai_link->dpcm_playback = 1; + dai_link->dpcm_capture = 1; + + /* override any BE fixups */ + dai_link->be_hw_params_fixup = + platform->driver->be_hw_params_fixup; + + /* most BE links dont set stream name, so set it to + * dai link name if it's NULL to help bind widgets. + */ + if (!dai_link->stream_name) + dai_link->stream_name = dai_link->name; + } + } +} + static int snd_soc_instantiate_card(struct snd_soc_card *card) { struct snd_soc_codec *codec; @@ -2086,6 +2156,9 @@ static int snd_soc_instantiate_card(struct snd_soc_card *card) mutex_lock(&client_mutex); mutex_lock_nested(&card->mutex, SND_SOC_CARD_CLASS_INIT); + /* check whether any platform is ignore machine FE and using topology */ + soc_check_tplg_fes(card); + /* bind DAIs */ for (i = 0; i < card->num_links; i++) { ret = soc_bind_dai_link(card, &card->dai_link[i]); diff --git a/sound/soc/soc-pcm.c b/sound/soc/soc-pcm.c index 68d9dc930096..4ce489165a6d 100644 --- a/sound/soc/soc-pcm.c +++ b/sound/soc/soc-pcm.c @@ -909,8 +909,20 @@ int soc_dai_hw_params(struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params, struct snd_soc_dai *dai) { + struct snd_soc_pcm_runtime *rtd = substream->private_data; int ret; + /* perform any topology hw_params fixups before DAI */ + if (rtd->dai_link->be_hw_params_fixup) { + ret = rtd->dai_link->be_hw_params_fixup(rtd, params); + if (ret < 0) { + dev_err(rtd->dev, + "ASoC: hw_params topology fixup failed %d\n", + ret); + return ret; + } + } + if (dai->driver->ops->hw_params) { ret = dai->driver->ops->hw_params(substream, params, dai); if (ret < 0) { -- cgit v1.2.3 From f11a5c27f9287cacad74af31cd92d4413eccc05a Mon Sep 17 00:00:00 2001 From: Liam Girdwood Date: Tue, 27 Mar 2018 14:30:41 +0100 Subject: ASoC: core: Add name prefix for machines with topology rewrites Signed-off-by: Liam Girdwood Signed-off-by: Mark Brown --- include/sound/soc.h | 2 ++ sound/soc/soc-core.c | 8 ++++++++ 2 files changed, 10 insertions(+) (limited to 'include') diff --git a/include/sound/soc.h b/include/sound/soc.h index fac4ff04fb7d..3676d0a8f532 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -1015,6 +1015,7 @@ struct snd_soc_platform_driver { /* this platform uses topology and ignore machine driver FEs */ const char *ignore_machine; + const char *topology_name_prefix; int (*be_hw_params_fixup)(struct snd_soc_pcm_runtime *rtd, struct snd_pcm_hw_params *params); bool use_dai_pcm_id; /* use the DAI link PCM ID as PCM device number */ @@ -1167,6 +1168,7 @@ struct snd_soc_card { const char *long_name; const char *driver_name; char dmi_longname[80]; + char topology_shortname[32]; struct device *dev; struct snd_card *snd_card; diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index 8f01fe8296ff..aa1c33b3cce0 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -2143,6 +2143,14 @@ static void soc_check_tplg_fes(struct snd_soc_card *card) if (!dai_link->stream_name) dai_link->stream_name = 
dai_link->name; } + + /* Inform userspace we are using alternate topology */ + if (platform->driver->topology_name_prefix) { + snprintf(card->topology_shortname, 32, "%s-%s", + platform->driver->topology_name_prefix, + card->name); + card->name = card->topology_shortname; + } } } -- cgit v1.2.3 From 81e9b0a078894841a50a8dd666fd64ca452a2a50 Mon Sep 17 00:00:00 2001 From: Liam Girdwood Date: Tue, 27 Mar 2018 14:30:42 +0100 Subject: ASoC: topology: Give more data to clients via callbacks Give topology clients more access to the topology data by passing index, pcm, link_config and dai_driver to clients. This allows clients to fully instantiate and track topology objects. The SOF driver is the first user of these new APIs and needs them to build component topology driver and FW objects. Signed-off-by: Liam Girdwood Signed-off-by: Mark Brown --- include/sound/soc-topology.h | 23 ++++++++++++++--------- sound/soc/intel/skylake/skl-pcm.c | 7 ++++--- sound/soc/intel/skylake/skl-topology.c | 5 +++-- sound/soc/intel/skylake/skl-topology.h | 20 +++++--------------- sound/soc/soc-topology.c | 31 ++++++++++++++++++------------- 5 files changed, 44 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/include/sound/soc-topology.h b/include/sound/soc-topology.h index f552c3f56368..e1f265e21ee1 100644 --- a/include/sound/soc-topology.h +++ b/include/sound/soc-topology.h @@ -30,6 +30,8 @@ struct snd_soc_dapm_context; struct snd_soc_card; struct snd_kcontrol_new; struct snd_soc_dai_link; +struct snd_soc_dai_driver; +struct snd_soc_dai; /* object scan be loaded and unloaded in groups with identfying indexes */ #define SND_SOC_TPLG_INDEX_ALL 0 /* ID that matches all FW objects */ @@ -109,35 +111,38 @@ struct snd_soc_tplg_widget_events { struct snd_soc_tplg_ops { /* external kcontrol init - used for any driver specific init */ - int (*control_load)(struct snd_soc_component *, + int (*control_load)(struct snd_soc_component *, int index, struct snd_kcontrol_new *, struct snd_soc_tplg_ctl_hdr *); int (*control_unload)(struct snd_soc_component *, struct snd_soc_dobj *); /* external widget init - used for any driver specific init */ - int (*widget_load)(struct snd_soc_component *, + int (*widget_load)(struct snd_soc_component *, int index, struct snd_soc_dapm_widget *, struct snd_soc_tplg_dapm_widget *); - int (*widget_ready)(struct snd_soc_component *, + int (*widget_ready)(struct snd_soc_component *, int index, struct snd_soc_dapm_widget *, struct snd_soc_tplg_dapm_widget *); int (*widget_unload)(struct snd_soc_component *, struct snd_soc_dobj *); /* FE DAI - used for any driver specific init */ - int (*dai_load)(struct snd_soc_component *, - struct snd_soc_dai_driver *dai_drv); + int (*dai_load)(struct snd_soc_component *, int index, + struct snd_soc_dai_driver *dai_drv, + struct snd_soc_tplg_pcm *pcm, struct snd_soc_dai *dai); + int (*dai_unload)(struct snd_soc_component *, struct snd_soc_dobj *); /* DAI link - used for any driver specific init */ - int (*link_load)(struct snd_soc_component *, - struct snd_soc_dai_link *link); + int (*link_load)(struct snd_soc_component *, int index, + struct snd_soc_dai_link *link, + struct snd_soc_tplg_link_config *cfg); int (*link_unload)(struct snd_soc_component *, struct snd_soc_dobj *); /* callback to handle vendor bespoke data */ - int (*vendor_load)(struct snd_soc_component *, + int (*vendor_load)(struct snd_soc_component *, int index, struct snd_soc_tplg_hdr *); int (*vendor_unload)(struct snd_soc_component *, struct snd_soc_tplg_hdr *); @@ -146,7 +151,7 @@ 
struct snd_soc_tplg_ops { void (*complete)(struct snd_soc_component *); /* manifest - optional to inform component of manifest */ - int (*manifest)(struct snd_soc_component *, + int (*manifest)(struct snd_soc_component *, int index, struct snd_soc_tplg_manifest *); /* vendor specific kcontrol handlers available for binding */ diff --git a/sound/soc/intel/skylake/skl-pcm.c b/sound/soc/intel/skylake/skl-pcm.c index afa86b9e4dcf..1f4dd08d36c5 100644 --- a/sound/soc/intel/skylake/skl-pcm.c +++ b/sound/soc/intel/skylake/skl-pcm.c @@ -1017,10 +1017,11 @@ static struct snd_soc_dai_driver skl_platform_dai[] = { }, }; -int skl_dai_load(struct snd_soc_component *cmp, - struct snd_soc_dai_driver *pcm_dai) +int skl_dai_load(struct snd_soc_component *cmp, int index, + struct snd_soc_dai_driver *dai_drv, + struct snd_soc_tplg_pcm *pcm, struct snd_soc_dai *dai) { - pcm_dai->ops = &skl_pcm_dai_ops; + dai_drv->ops = &skl_pcm_dai_ops; return 0; } diff --git a/sound/soc/intel/skylake/skl-topology.c b/sound/soc/intel/skylake/skl-topology.c index 3b1dca419883..6ac081f1f215 100644 --- a/sound/soc/intel/skylake/skl-topology.c +++ b/sound/soc/intel/skylake/skl-topology.c @@ -2851,7 +2851,7 @@ void skl_cleanup_resources(struct skl *skl) * information to the driver about module and pipeline parameters which DSP * FW expects like ids, resource values, formats etc */ -static int skl_tplg_widget_load(struct snd_soc_component *cmpnt, +static int skl_tplg_widget_load(struct snd_soc_component *cmpnt, int index, struct snd_soc_dapm_widget *w, struct snd_soc_tplg_dapm_widget *tplg_w) { @@ -2958,6 +2958,7 @@ static int skl_init_enum_data(struct device *dev, struct soc_enum *se, } static int skl_tplg_control_load(struct snd_soc_component *cmpnt, + int index, struct snd_kcontrol_new *kctl, struct snd_soc_tplg_ctl_hdr *hdr) { @@ -3446,7 +3447,7 @@ static int skl_tplg_get_manifest_data(struct snd_soc_tplg_manifest *manifest, return 0; } -static int skl_manifest_load(struct snd_soc_component *cmpnt, +static int skl_manifest_load(struct snd_soc_component *cmpnt, int index, struct snd_soc_tplg_manifest *manifest) { struct hdac_ext_bus *ebus = snd_soc_component_get_drvdata(cmpnt); diff --git a/sound/soc/intel/skylake/skl-topology.h b/sound/soc/intel/skylake/skl-topology.h index b1e0667c0ae0..77857c598eed 100644 --- a/sound/soc/intel/skylake/skl-topology.h +++ b/sound/soc/intel/skylake/skl-topology.h @@ -221,18 +221,9 @@ struct skl_mod_inst_map { u16 inst_id; }; -struct skl_uuid_inst_map { - u16 inst_id; - u16 reserved; - uuid_le mod_uuid; -} __packed; - struct skl_kpb_params { u32 num_modules; - union { - struct skl_mod_inst_map map[0]; - struct skl_uuid_inst_map map_uuid[0]; - } u; + struct skl_mod_inst_map map[0]; }; struct skl_module_inst_id { @@ -469,7 +460,7 @@ int skl_dsp_set_dma_control(struct skl_sst *ctx, u32 *caps, u32 caps_size, u32 node_id); void skl_tplg_set_be_dmic_config(struct snd_soc_dai *dai, struct skl_pipe_params *params, int stream); -int skl_tplg_init(struct snd_soc_component *component, +int skl_tplg_init(struct snd_soc_platform *platform, struct hdac_ext_bus *ebus); struct skl_module_cfg *skl_tplg_fe_get_cpr_module( struct snd_soc_dai *dai, int stream); @@ -512,8 +503,7 @@ int skl_pcm_host_dma_prepare(struct device *dev, int skl_pcm_link_dma_prepare(struct device *dev, struct skl_pipe_params *params); -int skl_dai_load(struct snd_soc_component *cmp, - struct snd_soc_dai_driver *pcm_dai); -void skl_tplg_add_moduleid_in_bind_params(struct skl *skl, - struct snd_soc_dapm_widget *w); +int skl_dai_load(struct 
snd_soc_component *, int index, + struct snd_soc_dai_driver *dai_drv, + struct snd_soc_tplg_pcm *pcm, struct snd_soc_dai *dai); #endif diff --git a/sound/soc/soc-topology.c b/sound/soc/soc-topology.c index ec2ef7629dbb..028bcaa0eda5 100644 --- a/sound/soc/soc-topology.c +++ b/sound/soc/soc-topology.c @@ -315,7 +315,7 @@ static int soc_tplg_vendor_load_(struct soc_tplg *tplg, int ret = 0; if (tplg->comp && tplg->ops && tplg->ops->vendor_load) - ret = tplg->ops->vendor_load(tplg->comp, hdr); + ret = tplg->ops->vendor_load(tplg->comp, tplg->index, hdr); else { dev_err(tplg->dev, "ASoC: no vendor load callback for ID %d\n", hdr->vendor_type); @@ -347,7 +347,8 @@ static int soc_tplg_widget_load(struct soc_tplg *tplg, struct snd_soc_dapm_widget *w, struct snd_soc_tplg_dapm_widget *tplg_w) { if (tplg->comp && tplg->ops && tplg->ops->widget_load) - return tplg->ops->widget_load(tplg->comp, w, tplg_w); + return tplg->ops->widget_load(tplg->comp, tplg->index, w, + tplg_w); return 0; } @@ -358,27 +359,30 @@ static int soc_tplg_widget_ready(struct soc_tplg *tplg, struct snd_soc_dapm_widget *w, struct snd_soc_tplg_dapm_widget *tplg_w) { if (tplg->comp && tplg->ops && tplg->ops->widget_ready) - return tplg->ops->widget_ready(tplg->comp, w, tplg_w); + return tplg->ops->widget_ready(tplg->comp, tplg->index, w, + tplg_w); return 0; } /* pass DAI configurations to component driver for extra initialization */ static int soc_tplg_dai_load(struct soc_tplg *tplg, - struct snd_soc_dai_driver *dai_drv) + struct snd_soc_dai_driver *dai_drv, + struct snd_soc_tplg_pcm *pcm, struct snd_soc_dai *dai) { if (tplg->comp && tplg->ops && tplg->ops->dai_load) - return tplg->ops->dai_load(tplg->comp, dai_drv); + return tplg->ops->dai_load(tplg->comp, tplg->index, dai_drv, + pcm, dai); return 0; } /* pass link configurations to component driver for extra initialization */ static int soc_tplg_dai_link_load(struct soc_tplg *tplg, - struct snd_soc_dai_link *link) + struct snd_soc_dai_link *link, struct snd_soc_tplg_link_config *cfg) { if (tplg->comp && tplg->ops && tplg->ops->link_load) - return tplg->ops->link_load(tplg->comp, link); + return tplg->ops->link_load(tplg->comp, tplg->index, link, cfg); return 0; } @@ -699,7 +703,8 @@ static int soc_tplg_init_kcontrol(struct soc_tplg *tplg, struct snd_kcontrol_new *k, struct snd_soc_tplg_ctl_hdr *hdr) { if (tplg->comp && tplg->ops && tplg->ops->control_load) - return tplg->ops->control_load(tplg->comp, k, hdr); + return tplg->ops->control_load(tplg->comp, tplg->index, k, + hdr); return 0; } @@ -1755,7 +1760,7 @@ static int soc_tplg_dai_create(struct soc_tplg *tplg, } /* pass control to component driver for optional further init */ - ret = soc_tplg_dai_load(tplg, dai_drv); + ret = soc_tplg_dai_load(tplg, dai_drv, pcm, NULL); if (ret < 0) { dev_err(tplg->comp->dev, "ASoC: DAI loading failed\n"); kfree(dai_drv); @@ -1825,7 +1830,7 @@ static int soc_tplg_fe_link_create(struct soc_tplg *tplg, set_link_flags(link, pcm->flag_mask, pcm->flags); /* pass control to component driver for optional further init */ - ret = soc_tplg_dai_link_load(tplg, link); + ret = soc_tplg_dai_link_load(tplg, link, NULL); if (ret < 0) { dev_err(tplg->comp->dev, "ASoC: FE link loading failed\n"); kfree(link); @@ -2133,7 +2138,7 @@ static int soc_tplg_link_config(struct soc_tplg *tplg, set_link_flags(link, cfg->flag_mask, cfg->flags); /* pass control to component driver for optional further init */ - ret = soc_tplg_dai_link_load(tplg, link); + ret = soc_tplg_dai_link_load(tplg, link, cfg); if (ret < 0) { 
dev_err(tplg->dev, "ASoC: physical link loading failed\n"); return ret; @@ -2255,7 +2260,7 @@ static int soc_tplg_dai_config(struct soc_tplg *tplg, set_dai_flags(dai_drv, d->flag_mask, d->flags); /* pass control to component driver for optional further init */ - ret = soc_tplg_dai_load(tplg, dai_drv); + ret = soc_tplg_dai_load(tplg, dai_drv, NULL, dai); if (ret < 0) { dev_err(tplg->comp->dev, "ASoC: DAI loading failed\n"); return ret; @@ -2361,7 +2366,7 @@ static int soc_tplg_manifest_load(struct soc_tplg *tplg, /* pass control to component driver for optional further init */ if (tplg->comp && tplg->ops && tplg->ops->manifest) - return tplg->ops->manifest(tplg->comp, _manifest); + return tplg->ops->manifest(tplg->comp, tplg->index, _manifest); if (!abi_match) /* free the duplicated one */ kfree(_manifest); -- cgit v1.2.3 From 28aa6f7779f77a863a08c1b9db4b654a94c86dd0 Mon Sep 17 00:00:00 2001 From: Liam Girdwood Date: Tue, 27 Mar 2018 14:30:43 +0100 Subject: ASoC: topology: Add callback for DAPM route load/unload Add a callback fro clients for notification about DAPM route loading and unloading. Signed-off-by: Liam Girdwood Signed-off-by: Mark Brown --- include/sound/soc-topology.h | 7 +++++++ sound/soc/soc-topology.c | 13 +++++++++++++ 2 files changed, 20 insertions(+) (limited to 'include') diff --git a/include/sound/soc-topology.h b/include/sound/soc-topology.h index e1f265e21ee1..401ef2c45d6c 100644 --- a/include/sound/soc-topology.h +++ b/include/sound/soc-topology.h @@ -32,6 +32,7 @@ struct snd_kcontrol_new; struct snd_soc_dai_link; struct snd_soc_dai_driver; struct snd_soc_dai; +struct snd_soc_dapm_route; /* object scan be loaded and unloaded in groups with identfying indexes */ #define SND_SOC_TPLG_INDEX_ALL 0 /* ID that matches all FW objects */ @@ -116,6 +117,12 @@ struct snd_soc_tplg_ops { int (*control_unload)(struct snd_soc_component *, struct snd_soc_dobj *); + /* DAPM graph route element loading and unloading */ + int (*dapm_route_load)(struct snd_soc_component *, int index, + struct snd_soc_dapm_route *route); + int (*dapm_route_unload)(struct snd_soc_component *, + struct snd_soc_dobj *); + /* external widget init - used for any driver specific init */ int (*widget_load)(struct snd_soc_component *, int index, struct snd_soc_dapm_widget *, diff --git a/sound/soc/soc-topology.c b/sound/soc/soc-topology.c index 028bcaa0eda5..b9370ae31907 100644 --- a/sound/soc/soc-topology.c +++ b/sound/soc/soc-topology.c @@ -1161,6 +1161,17 @@ static int soc_tplg_kcontrol_elems_load(struct soc_tplg *tplg, return 0; } +/* optionally pass new dynamic kcontrol to component driver. */ +static int soc_tplg_add_route(struct soc_tplg *tplg, + struct snd_soc_dapm_route *route) +{ + if (tplg->comp && tplg->ops && tplg->ops->dapm_route_load) + return tplg->ops->dapm_route_load(tplg->comp, tplg->index, + route); + + return 0; +} + static int soc_tplg_dapm_graph_elems_load(struct soc_tplg *tplg, struct snd_soc_tplg_hdr *hdr) { @@ -1209,6 +1220,8 @@ static int soc_tplg_dapm_graph_elems_load(struct soc_tplg *tplg, else route.control = elem->control; + soc_tplg_add_route(tplg, &route); + /* add route, but keep going if some fail */ snd_soc_dapm_add_routes(dapm, &route, 1); } -- cgit v1.2.3 From 032234d8231909ac049f22ea3b408487e1c103eb Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 10:00:39 -0700 Subject: net/ipv6: Make __inet6_bind static BPF core gets access to __inet6_bind via ipv6_bpf_stub_impl, so it is not invoked directly outside of af_inet6.c. 
Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ipv6.h | 2 -- net/ipv6/af_inet6.c | 53 ++++++++++++++++++++++++++--------------------------- 2 files changed, 26 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 836f31af1369..68b167d98879 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1044,8 +1044,6 @@ void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info); void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu); int inet6_release(struct socket *sock); -int __inet6_bind(struct sock *sock, struct sockaddr *uaddr, int addr_len, - bool force_bind_address_no_port, bool with_lock); int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); int inet6_getname(struct socket *sock, struct sockaddr *uaddr, int peer); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 2c694912df2e..36d622c477b1 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -273,33 +273,8 @@ out_rcu_unlock: goto out; } - -/* bind for INET6 API */ -int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) -{ - struct sock *sk = sock->sk; - int err = 0; - - /* If the socket has its own bind function then use it. */ - if (sk->sk_prot->bind) - return sk->sk_prot->bind(sk, uaddr, addr_len); - - if (addr_len < SIN6_LEN_RFC2133) - return -EINVAL; - - /* BPF prog is run before any checks are done so that if the prog - * changes context in a wrong way it will be caught. - */ - err = BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr); - if (err) - return err; - - return __inet6_bind(sk, uaddr, addr_len, false, true); -} -EXPORT_SYMBOL(inet6_bind); - -int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, - bool force_bind_address_no_port, bool with_lock) +static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, + bool force_bind_address_no_port, bool with_lock) { struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr; struct inet_sock *inet = inet_sk(sk); @@ -444,6 +419,30 @@ out_unlock: goto out; } +/* bind for INET6 API */ +int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sock *sk = sock->sk; + int err = 0; + + /* If the socket has its own bind function then use it. */ + if (sk->sk_prot->bind) + return sk->sk_prot->bind(sk, uaddr, addr_len); + + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + + /* BPF prog is run before any checks are done so that if the prog + * changes context in a wrong way it will be caught. + */ + err = BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr); + if (err) + return err; + + return __inet6_bind(sk, uaddr, addr_len, false, true); +} +EXPORT_SYMBOL(inet6_bind); + int inet6_release(struct socket *sock) { struct sock *sk = sock->sk; -- cgit v1.2.3 From bdb7cc643fc9db8d6ed9a2b9e524e27ac5882029 Mon Sep 17 00:00:00 2001 From: Stephen Suryaputra Date: Mon, 16 Apr 2018 13:42:16 -0400 Subject: ipv6: Count interface receive statistics on the ingress netdev The statistics such as InHdrErrors should be counted on the ingress netdev rather than on the dev from the dst, which is the egress. Signed-off-by: Stephen Suryaputra Signed-off-by: David S. 
Miller --- include/net/addrconf.h | 14 +++++++++++ net/ipv6/exthdrs.c | 55 ++++++++++++++++------------------------- net/ipv6/ip6_input.c | 2 +- net/ipv6/ip6_output.c | 18 ++++++-------- net/ipv6/reassembly.c | 6 ++--- net/ipv6/route.c | 3 ++- net/netfilter/ipvs/ip_vs_xmit.c | 5 ++-- 7 files changed, 51 insertions(+), 52 deletions(-) (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 378d601258be..8312cc25a3af 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -307,6 +307,20 @@ static inline struct inet6_dev *__in6_dev_get(const struct net_device *dev) return rcu_dereference_rtnl(dev->ip6_ptr); } +/** + * __in6_dev_get_safely - get inet6_dev pointer from netdevice + * @dev: network device + * + * This is a safer version of __in6_dev_get + */ +static inline struct inet6_dev *__in6_dev_get_safely(const struct net_device *dev) +{ + if (likely(dev)) + return rcu_dereference_rtnl(dev->ip6_ptr); + else + return NULL; +} + /** * in6_dev_get - get inet6_dev pointer from netdevice * @dev: network device diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index bc68eb661970..5bc2bf3733ab 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -280,6 +280,7 @@ static const struct tlvtype_proc tlvprocdestopt_lst[] = { static int ipv6_destopt_rcv(struct sk_buff *skb) { + struct inet6_dev *idev = __in6_dev_get(skb->dev); struct inet6_skb_parm *opt = IP6CB(skb); #if IS_ENABLED(CONFIG_IPV6_MIP6) __u16 dstbuf; @@ -291,7 +292,7 @@ static int ipv6_destopt_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || !pskb_may_pull(skb, (skb_transport_offset(skb) + ((skb_transport_header(skb)[1] + 1) << 3)))) { - __IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), + __IP6_INC_STATS(dev_net(dst->dev), idev, IPSTATS_MIB_INHDRERRORS); fail_and_free: kfree_skb(skb); @@ -319,8 +320,7 @@ fail_and_free: return 1; } - __IP6_INC_STATS(dev_net(dst->dev), - ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); return -1; } @@ -416,8 +416,7 @@ looped_back: } if (hdr->segments_left >= (hdr->hdrlen >> 1)) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ((&hdr->segments_left) - skb_network_header(skb))); @@ -456,8 +455,7 @@ looped_back: if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) { if (ipv6_hdr(skb)->hop_limit <= 1) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); kfree_skb(skb); @@ -481,10 +479,10 @@ looped_back: /* called with rcu_read_lock() */ static int ipv6_rthdr_rcv(struct sk_buff *skb) { + struct inet6_dev *idev = __in6_dev_get(skb->dev); struct inet6_skb_parm *opt = IP6CB(skb); struct in6_addr *addr = NULL; struct in6_addr daddr; - struct inet6_dev *idev; int n, i; struct ipv6_rt_hdr *hdr; struct rt0_hdr *rthdr; @@ -498,8 +496,7 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || !pskb_may_pull(skb, (skb_transport_offset(skb) + ((skb_transport_header(skb)[1] + 1) << 3)))) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -1; } @@ -508,8 +505,7 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb) if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) || 
skb->pkt_type != PACKET_HOST) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INADDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } @@ -527,7 +523,7 @@ looped_back: * processed by own */ if (!addr) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; @@ -553,8 +549,7 @@ looped_back: goto unknown_rh; /* Silently discard invalid RTH type 2 */ if (hdr->hdrlen != 2 || hdr->segments_left != 1) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -1; } @@ -572,8 +567,7 @@ looped_back: n = hdr->hdrlen >> 1; if (hdr->segments_left > n) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ((&hdr->segments_left) - skb_network_header(skb))); @@ -609,14 +603,12 @@ looped_back: if (xfrm6_input_addr(skb, (xfrm_address_t *)addr, (xfrm_address_t *)&ipv6_hdr(skb)->saddr, IPPROTO_ROUTING) < 0) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INADDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } if (!ipv6_chk_home_addr(dev_net(skb_dst(skb)->dev), addr)) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INADDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } @@ -627,8 +619,7 @@ looped_back: } if (ipv6_addr_is_multicast(addr)) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INADDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } @@ -647,8 +638,7 @@ looped_back: if (skb_dst(skb)->dev->flags&IFF_LOOPBACK) { if (ipv6_hdr(skb)->hop_limit <= 1) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); kfree_skb(skb); @@ -663,7 +653,7 @@ looped_back: return -1; unknown_rh: - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, (&hdr->type) - skb_network_header(skb)); return -1; @@ -755,34 +745,31 @@ static bool ipv6_hop_ra(struct sk_buff *skb, int optoff) static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff) { const unsigned char *nh = skb_network_header(skb); + struct inet6_dev *idev = __in6_dev_get_safely(skb->dev); struct net *net = ipv6_skb_net(skb); u32 pkt_len; if (nh[optoff + 1] != 4 || (optoff & 3) != 2) { net_dbg_ratelimited("ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n", nh[optoff+1]); - __IP6_INC_STATS(net, ipv6_skb_idev(skb), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); goto drop; } pkt_len = ntohl(*(__be32 *)(nh + optoff + 2)); if (pkt_len <= IPV6_MAXPLEN) { - __IP6_INC_STATS(net, ipv6_skb_idev(skb), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff+2); return false; } if (ipv6_hdr(skb)->payload_len) { - __IP6_INC_STATS(net, ipv6_skb_idev(skb), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff); return false; } if (pkt_len > skb->len - sizeof(struct ipv6hdr)) { - __IP6_INC_STATS(net, 
ipv6_skb_idev(skb), - IPSTATS_MIB_INTRUNCATEDPKTS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 9ee208a348f5..f08d34491ece 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -336,7 +336,7 @@ int ip6_mc_input(struct sk_buff *skb) bool deliver; __IP6_UPD_PO_STATS(dev_net(skb_dst(skb)->dev), - ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INMCAST, + __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INMCAST, skb->len); hdr = ipv6_hdr(skb); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 2e891d2c30ef..a39b04f9fa6e 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -425,6 +425,7 @@ static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) int ip6_forward(struct sk_buff *skb) { + struct inet6_dev *idev = __in6_dev_get_safely(skb->dev); struct dst_entry *dst = skb_dst(skb); struct ipv6hdr *hdr = ipv6_hdr(skb); struct inet6_skb_parm *opt = IP6CB(skb); @@ -444,8 +445,7 @@ int ip6_forward(struct sk_buff *skb) goto drop; if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) { - __IP6_INC_STATS(net, ip6_dst_idev(dst), - IPSTATS_MIB_INDISCARDS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); goto drop; } @@ -476,8 +476,7 @@ int ip6_forward(struct sk_buff *skb) /* Force OUTPUT device used as source address */ skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); - __IP6_INC_STATS(net, ip6_dst_idev(dst), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -ETIMEDOUT; @@ -490,15 +489,13 @@ int ip6_forward(struct sk_buff *skb) if (proxied > 0) return ip6_input(skb); else if (proxied < 0) { - __IP6_INC_STATS(net, ip6_dst_idev(dst), - IPSTATS_MIB_INDISCARDS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); goto drop; } } if (!xfrm6_route_forward(skb)) { - __IP6_INC_STATS(net, ip6_dst_idev(dst), - IPSTATS_MIB_INDISCARDS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); goto drop; } dst = skb_dst(skb); @@ -554,8 +551,7 @@ int ip6_forward(struct sk_buff *skb) /* Again, force OUTPUT device used as source address */ skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - __IP6_INC_STATS(net, ip6_dst_idev(dst), - IPSTATS_MIB_INTOOBIGERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS); __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); @@ -579,7 +575,7 @@ int ip6_forward(struct sk_buff *skb) ip6_forward_finish); error: - __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); drop: kfree_skb(skb); return -EINVAL; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 4979610287e2..2cdf3dcf8c2c 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -179,7 +179,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, ((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1))); if ((unsigned int)end > IPV6_MAXPLEN) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ((u8 *)&fhdr->frag_off - @@ -214,7 +214,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, /* RFC2460 says always send parameter problem in * this case. 
-DaveM */ - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, offsetof(struct ipv6hdr, payload_len)); @@ -536,7 +536,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) return -1; fail_hdr: - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb_network_header_len(skb)); return -1; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 49b954d6d0fa..1d738bfe893b 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3541,7 +3541,8 @@ static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) case IPSTATS_MIB_INNOROUTES: type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); if (type == IPV6_ADDR_ANY) { - IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), + IP6_INC_STATS(dev_net(dst->dev), + __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INADDRERRORS); break; } diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 4527921b1c3a..ba0a0fd045c8 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -266,12 +266,13 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs, /* check and decrement ttl */ if (ipv6_hdr(skb)->hop_limit <= 1) { + struct inet6_dev *idev = __in6_dev_get_safely(skb->dev); + /* Force OUTPUT device used as source address */ skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); - __IP6_INC_STATS(net, ip6_dst_idev(dst), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); return false; } -- cgit v1.2.3 From 72f6d71e491e6ce269b564865b21fab0a4402dd3 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 17 Apr 2018 14:11:28 +0800 Subject: vxlan: add ttl inherit support Like tos inherit, ttl inherit should also mean inheriting the inner protocol's TTL value, which is actually not implemented in vxlan yet. But we can not treat ttl == 0 as "use the inner TTL", because ttl == 0 is also used when the "ttl" option is not specified; changing its meaning would be a behavior change and would break real use cases. So add a different attribute, IFLA_VXLAN_TTL_INHERIT, which is set when "ttl inherit" is specified with the ip command.
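With an iproute2 new enough to understand the new attribute, the usage would look something like this hypothetical invocation:

    ip link add vxlan0 type vxlan id 42 remote 192.0.2.1 dstport 4789 dev eth0 ttl inherit

Here "ttl inherit" sets IFLA_VXLAN_TTL_INHERIT, while "ttl 0" (or omitting "ttl" entirely) keeps the old automatic behavior.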
Reported-by: Jianlin Shi Suggested-by: Jiri Benc Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 17 ++++++++++++++--- include/net/ip_tunnels.h | 11 +++++++++++ include/net/vxlan.h | 1 + include/uapi/linux/if_link.h | 1 + 4 files changed, 27 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index fab7a4db249e..aee0e60471f1 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -2085,9 +2085,13 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, local_ip = vxlan->cfg.saddr; dst_cache = &rdst->dst_cache; md->gbp = skb->mark; - ttl = vxlan->cfg.ttl; - if (!ttl && vxlan_addr_multicast(dst)) - ttl = 1; + if (flags & VXLAN_F_TTL_INHERIT) { + ttl = ip_tunnel_get_ttl(old_iph, skb); + } else { + ttl = vxlan->cfg.ttl; + if (!ttl && vxlan_addr_multicast(dst)) + ttl = 1; + } tos = vxlan->cfg.tos; if (tos == 1) @@ -2709,6 +2713,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { [IFLA_VXLAN_GBP] = { .type = NLA_FLAG, }, [IFLA_VXLAN_GPE] = { .type = NLA_FLAG, }, [IFLA_VXLAN_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG }, + [IFLA_VXLAN_TTL_INHERIT] = { .type = NLA_FLAG }, }; static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[], @@ -3254,6 +3259,12 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], if (data[IFLA_VXLAN_TTL]) conf->ttl = nla_get_u8(data[IFLA_VXLAN_TTL]); + if (data[IFLA_VXLAN_TTL_INHERIT]) { + if (changelink) + return -EOPNOTSUPP; + conf->flags |= VXLAN_F_TTL_INHERIT; + } + if (data[IFLA_VXLAN_LABEL]) conf->label = nla_get_be32(data[IFLA_VXLAN_LABEL]) & IPV6_FLOWLABEL_MASK; diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 540a4b4417bf..751646adc769 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -379,6 +379,17 @@ static inline u8 ip_tunnel_get_dsfield(const struct iphdr *iph, return 0; } +static inline u8 ip_tunnel_get_ttl(const struct iphdr *iph, + const struct sk_buff *skb) +{ + if (skb->protocol == htons(ETH_P_IP)) + return iph->ttl; + else if (skb->protocol == htons(ETH_P_IPV6)) + return ((const struct ipv6hdr *)iph)->hop_limit; + else + return 0; +} + /* Propogate ECN bits out */ static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph, const struct sk_buff *skb) diff --git a/include/net/vxlan.h b/include/net/vxlan.h index ad73d8b3fcc2..b99a02ae3934 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -262,6 +262,7 @@ struct vxlan_dev { #define VXLAN_F_COLLECT_METADATA 0x2000 #define VXLAN_F_GPE 0x4000 #define VXLAN_F_IPV6_LINKLOCAL 0x8000 +#define VXLAN_F_TTL_INHERIT 0x10000 /* Flags that are used in the receive path. These flags must match in * order for a socket to be shareable diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 68699f654118..b85266420bfb 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -516,6 +516,7 @@ enum { IFLA_VXLAN_COLLECT_METADATA, IFLA_VXLAN_LABEL, IFLA_VXLAN_GPE, + IFLA_VXLAN_TTL_INHERIT, __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) -- cgit v1.2.3 From 1e6c06a7e88c251d8a30271ad5206fbd967a4576 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Sun, 8 Apr 2018 11:06:57 +0800 Subject: hwspinlock: Introduce one new mode for hwspinlock In some scenarios, users need to perform time-consuming or sleepable operations under hardware spinlock protection, for synchronization between multiple subsystems. For example, there is one PMIC efuse on the Spreadtrum platform which needs to be accessed under one hardware lock. But while that hardware lock is held, the efuse operation takes almost 5 ms, so we can not disable interrupts or preemption for that long. Thus, introduce one new mode indicating that we just acquire the hardware lock and do not disable interrupts or preemption; in exchange, the user must protect the hardware lock with a mutex or spinlock to avoid deadlock.
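To make the intended pairing concrete, here is a minimal sketch of a user of the new raw mode (hypothetical driver code; the hwlock is assumed to have been obtained earlier, e.g. via hwspin_lock_request_specific()):

    #include <linux/hwspinlock.h>
    #include <linux/mutex.h>

    static DEFINE_MUTEX(efuse_mutex);      /* sleepable lock serializing local users */

    static int efuse_read_locked(struct hwspinlock *hwlock)
    {
            int ret;

            mutex_lock(&efuse_mutex);
            /* HWLOCK_RAW: interrupts and preemption stay enabled */
            ret = hwspin_lock_timeout_raw(hwlock, 100);
            if (ret) {
                    mutex_unlock(&efuse_mutex);
                    return ret;
            }

            /* the time-consuming (~5 ms) efuse access may happen here */

            hwspin_unlock_raw(hwlock);
            mutex_unlock(&efuse_mutex);
            return 0;
    }

The mutex is what keeps two local callers from racing for the hardware lock while it is held across a sleepable section.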
Signed-off-by: Baolin Wang Signed-off-by: Bjorn Andersson --- drivers/hwspinlock/hwspinlock_core.c | 34 ++++++++++++++++----- include/linux/hwspinlock.h | 58 ++++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/hwspinlock/hwspinlock_core.c b/drivers/hwspinlock/hwspinlock_core.c index f4a59f5631e4..5278d0560a4a 100644 --- a/drivers/hwspinlock/hwspinlock_core.c +++ b/drivers/hwspinlock/hwspinlock_core.c @@ -71,10 +71,16 @@ static DEFINE_MUTEX(hwspinlock_tree_lock); * This function attempts to lock an hwspinlock, and will immediately * fail if the hwspinlock is already taken. * - * Upon a successful return from this function, preemption (and possibly - * interrupts) is disabled, so the caller must not sleep, and is advised to - * release the hwspinlock as soon as possible. This is required in order to - * minimize remote cores polling on the hardware interconnect. + * Caution: If the mode is HWLOCK_RAW, that means user must protect the routine + * of getting hardware lock with mutex or spinlock. Since in some scenarios, + * user need some time-consuming or sleepable operations under the hardware + * lock, they need one sleepable lock (like mutex) to protect the operations. + * + * If the mode is not HWLOCK_RAW, upon a successful return from this function, + * preemption (and possibly interrupts) is disabled, so the caller must not + * sleep, and is advised to release the hwspinlock as soon as possible. This is + * required in order to minimize remote cores polling on the hardware + * interconnect. * * The user decides whether local interrupts are disabled or not, and if yes, * whether he wants their previous state to be saved. It is up to the user @@ -113,6 +119,9 @@ int __hwspin_trylock(struct hwspinlock *hwlock, int mode, unsigned long *flags) case HWLOCK_IRQ: ret = spin_trylock_irq(&hwlock->lock); break; + case HWLOCK_RAW: + ret = 1; + break; default: ret = spin_trylock(&hwlock->lock); break; @@ -134,6 +143,9 @@ int __hwspin_trylock(struct hwspinlock *hwlock, int mode, unsigned long *flags) case HWLOCK_IRQ: spin_unlock_irq(&hwlock->lock); break; + case HWLOCK_RAW: + /* Nothing to do */ + break; default: spin_unlock(&hwlock->lock); break; @@ -170,9 +182,14 @@ EXPORT_SYMBOL_GPL(__hwspin_trylock); * is already taken, the function will busy loop waiting for it to * be released, but give up after @timeout msecs have elapsed. * - * Upon a successful return from this function, preemption is disabled - * (and possibly local interrupts, too), so the caller must not sleep, - * and is advised to release the hwspinlock as soon as possible. + * Caution: If the mode is HWLOCK_RAW, that means user must protect the routine + * of getting hardware lock with mutex or spinlock. Since in some scenarios, + * user need some time-consuming or sleepable operations under the hardware + * lock, they need one sleepable lock (like mutex) to protect the operations. 
+ * + * If the mode is not HWLOCK_RAW, upon a successful return from this function, + * preemption is disabled (and possibly local interrupts, too), so the caller + * must not sleep, and is advised to release the hwspinlock as soon as possible. * This is required in order to minimize remote cores polling on the * hardware interconnect. * @@ -266,6 +283,9 @@ void __hwspin_unlock(struct hwspinlock *hwlock, int mode, unsigned long *flags) case HWLOCK_IRQ: spin_unlock_irq(&hwlock->lock); break; + case HWLOCK_RAW: + /* Nothing to do */ + break; default: spin_unlock(&hwlock->lock); break; diff --git a/include/linux/hwspinlock.h b/include/linux/hwspinlock.h index 859d673d98c8..fe450ee58d55 100644 --- a/include/linux/hwspinlock.h +++ b/include/linux/hwspinlock.h @@ -24,6 +24,7 @@ /* hwspinlock mode argument */ #define HWLOCK_IRQSTATE 0x01 /* Disable interrupts, save state */ #define HWLOCK_IRQ 0x02 /* Disable interrupts, don't save state */ +#define HWLOCK_RAW 0x03 struct device; struct device_node; @@ -175,6 +176,25 @@ static inline int hwspin_trylock_irq(struct hwspinlock *hwlock) return __hwspin_trylock(hwlock, HWLOCK_IRQ, NULL); } +/** + * hwspin_trylock_raw() - attempt to lock a specific hwspinlock + * @hwlock: an hwspinlock which we want to trylock + * + * This function attempts to lock an hwspinlock, and will immediately fail + * if the hwspinlock is already taken. + * + * Caution: User must protect the routine of getting hardware lock with mutex + * or spinlock to avoid dead-lock, that will let user can do some time-consuming + * or sleepable operations under the hardware lock. + * + * Returns 0 if we successfully locked the hwspinlock, -EBUSY if + * the hwspinlock was already taken, and -EINVAL if @hwlock is invalid. + */ +static inline int hwspin_trylock_raw(struct hwspinlock *hwlock) +{ + return __hwspin_trylock(hwlock, HWLOCK_RAW, NULL); +} + /** * hwspin_trylock() - attempt to lock a specific hwspinlock * @hwlock: an hwspinlock which we want to trylock @@ -242,6 +262,29 @@ int hwspin_lock_timeout_irq(struct hwspinlock *hwlock, unsigned int to) return __hwspin_lock_timeout(hwlock, to, HWLOCK_IRQ, NULL); } +/** + * hwspin_lock_timeout_raw() - lock an hwspinlock with timeout limit + * @hwlock: the hwspinlock to be locked + * @to: timeout value in msecs + * + * This function locks the underlying @hwlock. If the @hwlock + * is already taken, the function will busy loop waiting for it to + * be released, but give up when @timeout msecs have elapsed. + * + * Caution: User must protect the routine of getting hardware lock with mutex + * or spinlock to avoid dead-lock, that will let user can do some time-consuming + * or sleepable operations under the hardware lock. + * + * Returns 0 when the @hwlock was successfully taken, and an appropriate + * error code otherwise (most notably an -ETIMEDOUT if the @hwlock is still + * busy after @timeout msecs). The function will never sleep. + */ +static inline +int hwspin_lock_timeout_raw(struct hwspinlock *hwlock, unsigned int to) +{ + return __hwspin_lock_timeout(hwlock, to, HWLOCK_RAW, NULL); +} + /** * hwspin_lock_timeout() - lock an hwspinlock with timeout limit * @hwlock: the hwspinlock to be locked @@ -301,6 +344,21 @@ static inline void hwspin_unlock_irq(struct hwspinlock *hwlock) __hwspin_unlock(hwlock, HWLOCK_IRQ, NULL); } +/** + * hwspin_unlock_raw() - unlock hwspinlock + * @hwlock: a previously-acquired hwspinlock which we want to unlock + * + * This function will unlock a specific hwspinlock. + * + * @hwlock must be already locked (e.g. 
by hwspin_trylock()) before calling + * this function: it is a bug to call unlock on a @hwlock that is already + * unlocked. + */ +static inline void hwspin_unlock_raw(struct hwspinlock *hwlock) +{ + __hwspin_unlock(hwlock, HWLOCK_RAW, NULL); +} + /** * hwspin_unlock() - unlock hwspinlock * @hwlock: a previously-acquired hwspinlock which we want to unlock -- cgit v1.2.3 From ee6548d1d98df7df3b9c8103a42cf68b31c29417 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 3 Apr 2018 07:52:04 +0300 Subject: RDMA/rdma_cm: Delete rdma_addr_client The only thing it does is block module unload while work is posted from rdma_resolve_ip(). However, this is not the right place to do this. The users of rdma_resolve_ip() must ensure their own module does not unload until rdma_resolve_ip() calls the callback, or until rdma_addr_cancel() is called. Similarly callers to rdma_addr_find_l2_eth_by_grh() must ensure their module does not unload while they are calling code. The only two users are already safe, so there is no need for this. Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 38 ++++---------------------------------- drivers/infiniband/core/cma.c | 6 +----- include/rdma/ib_addr.h | 20 +------------------- 3 files changed, 6 insertions(+), 58 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 9756cfbdef0e..4f32c4062fb6 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -56,7 +56,6 @@ struct addr_req { struct sockaddr_storage src_addr; struct sockaddr_storage dst_addr; struct rdma_dev_addr *addr; - struct rdma_addr_client *client; void *context; void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context); @@ -220,28 +219,6 @@ int rdma_addr_size_kss(struct __kernel_sockaddr_storage *addr) } EXPORT_SYMBOL(rdma_addr_size_kss); -static struct rdma_addr_client self; - -void rdma_addr_register_client(struct rdma_addr_client *client) -{ - atomic_set(&client->refcount, 1); - init_completion(&client->comp); -} -EXPORT_SYMBOL(rdma_addr_register_client); - -static inline void put_client(struct rdma_addr_client *client) -{ - if (atomic_dec_and_test(&client->refcount)) - complete(&client->comp); -} - -void rdma_addr_unregister_client(struct rdma_addr_client *client) -{ - put_client(client); - wait_for_completion(&client->comp); -} -EXPORT_SYMBOL(rdma_addr_unregister_client); - void rdma_copy_addr(struct rdma_dev_addr *dev_addr, const struct net_device *dev, const unsigned char *dst_dev_addr) @@ -605,14 +582,12 @@ static void process_one_req(struct work_struct *_work) */ cancel_delayed_work(&req->work); list_del_init(&req->list); - put_client(req->client); kfree(req); } spin_unlock_bh(&lock); } -int rdma_resolve_ip(struct rdma_addr_client *client, - struct sockaddr *src_addr, struct sockaddr *dst_addr, +int rdma_resolve_ip(struct sockaddr *src_addr, struct sockaddr *dst_addr, struct rdma_dev_addr *addr, int timeout_ms, void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context), @@ -644,8 +619,6 @@ int rdma_resolve_ip(struct rdma_addr_client *client, req->addr = addr; req->callback = callback; req->context = context; - req->client = client; - atomic_inc(&client->refcount); INIT_DELAYED_WORK(&req->work, process_one_req); req->seq = (u32)atomic_inc_return(&ib_nl_addr_request_seq); @@ -661,7 +634,6 @@ int rdma_resolve_ip(struct rdma_addr_client *client, break; default: ret = req->status; - 
atomic_dec(&client->refcount); goto err; } return ret; @@ -722,7 +694,6 @@ void rdma_addr_cancel(struct rdma_dev_addr *addr) found->callback(-ECANCELED, (struct sockaddr *)&found->src_addr, found->addr, found->context); - put_client(found->client); kfree(found); } EXPORT_SYMBOL(rdma_addr_cancel); @@ -761,8 +732,8 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, dev_addr.net = &init_net; init_completion(&ctx.comp); - ret = rdma_resolve_ip(&self, &sgid_addr._sockaddr, &dgid_addr._sockaddr, - &dev_addr, 1000, resolve_cb, &ctx); + ret = rdma_resolve_ip(&sgid_addr._sockaddr, &dgid_addr._sockaddr, + &dev_addr, 1000, resolve_cb, &ctx); if (ret) return ret; @@ -806,14 +777,13 @@ int addr_init(void) return -ENOMEM; register_netevent_notifier(&nb); - rdma_addr_register_client(&self); return 0; } void addr_cleanup(void) { - rdma_addr_unregister_client(&self); unregister_netevent_notifier(&nb); destroy_workqueue(addr_wq); + WARN_ON(!list_empty(&req_list)); } diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 51a641002e10..48300838e354 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -156,7 +156,6 @@ static struct ib_client cma_client = { }; static struct ib_sa_client sa_client; -static struct rdma_addr_client addr_client; static LIST_HEAD(dev_list); static LIST_HEAD(listen_any_list); static DEFINE_MUTEX(lock); @@ -2910,7 +2909,7 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, if (dst_addr->sa_family == AF_IB) { ret = cma_resolve_ib_addr(id_priv); } else { - ret = rdma_resolve_ip(&addr_client, cma_src_addr(id_priv), + ret = rdma_resolve_ip(cma_src_addr(id_priv), dst_addr, &id->route.addr.dev_addr, timeout_ms, addr_handler, id_priv); } @@ -4547,7 +4546,6 @@ static int __init cma_init(void) goto err_wq; ib_sa_register_client(&sa_client); - rdma_addr_register_client(&addr_client); register_netdevice_notifier(&cma_nb); ret = ib_register_client(&cma_client); @@ -4561,7 +4559,6 @@ static int __init cma_init(void) err: unregister_netdevice_notifier(&cma_nb); - rdma_addr_unregister_client(&addr_client); ib_sa_unregister_client(&sa_client); err_wq: destroy_workqueue(cma_wq); @@ -4574,7 +4571,6 @@ static void __exit cma_cleanup(void) rdma_nl_unregister(RDMA_NL_RDMA_CM); ib_unregister_client(&cma_client); unregister_netdevice_notifier(&cma_nb); - rdma_addr_unregister_client(&addr_client); ib_sa_unregister_client(&sa_client); unregister_pernet_subsys(&cma_pernet_operations); destroy_workqueue(cma_wq); diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index a08cc7278980..c2c8b1fdeead 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -49,22 +49,6 @@ #include #include -struct rdma_addr_client { - atomic_t refcount; - struct completion comp; -}; - -/** - * rdma_addr_register_client - Register an address client. - */ -void rdma_addr_register_client(struct rdma_addr_client *client); - -/** - * rdma_addr_unregister_client - Deregister an address client. - * @client: Client object to deregister. - */ -void rdma_addr_unregister_client(struct rdma_addr_client *client); - /** * struct rdma_dev_addr - Contains resolved RDMA hardware addresses * @src_dev_addr: Source MAC address. @@ -99,7 +83,6 @@ int rdma_translate_ip(const struct sockaddr *addr, /** * rdma_resolve_ip - Resolve source and destination IP addresses to * RDMA hardware addresses. - * @client: Address client associated with request. * @src_addr: An optional source address to use in the resolution. 
If a * source address is not provided, a usable address will be returned via * the callback. @@ -112,8 +95,7 @@ int rdma_translate_ip(const struct sockaddr *addr, * or been canceled. A status of 0 indicates success. * @context: User-specified context associated with the call. */ -int rdma_resolve_ip(struct rdma_addr_client *client, - struct sockaddr *src_addr, struct sockaddr *dst_addr, +int rdma_resolve_ip(struct sockaddr *src_addr, struct sockaddr *dst_addr, struct rdma_dev_addr *addr, int timeout_ms, void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context), -- cgit v1.2.3 From fe1bd78bf18a7cb3eb76fceea9193534fb6619e3 Mon Sep 17 00:00:00 2001 From: Trent Piepho Date: Mon, 2 Apr 2018 13:06:05 -0700 Subject: ARM: imx: Update spi_imx platform data to reflect current state The docs for the spi_imx platform data still refer to a -32 offset used to specify a native chip select. This was removed in commit 602c8f4485cd ("spi: imx: fix use of native chip-selects with devicetree") and no longer works as documented. Update documentation. The macro MXC_SPI_CS() is no longer needed. If a board uses all native chip selects, then it's not necessary to specify a chip select array at all, as all native is the default (this is how device-tree configured SPI masters work too). Most of the spi-imx platform data users have their chip select arrays removed by this patch. This patch also fixes a bug in mx31moboard introduced in the '602 commit. When that board was updated in commit 901f26bce64a ("ARM: imx: set correct chip_select in platform setup") to reflect the SPI change, only SPI bus 2 was updated and SPI bus 1 was left with non-sequential chip selects. The mc13783 spi device on bus 1 had its chip select updated as if it were on bus 2.
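As an illustration of the documented convention (hypothetical board code, not from this patch), a board with chip select 0 on GPIO 25 and chip select 1 on the native SS1 line would now pass:

    #include <linux/errno.h>
    #include <linux/kernel.h>
    #include <linux/platform_data/spi-imx.h>

    /* Hypothetical board file: CS0 on GPIO 25, CS1 on the native SS1 line */
    static int board_spi1_cs[] = { 25, -ENOENT };

    static const struct spi_imx_master board_spi1_pdata __initconst = {
            .chipselect     = board_spi1_cs,
            .num_chipselect = ARRAY_SIZE(board_spi1_cs),
    };

A board using only native chip selects would instead omit the array and set just .num_chipselect, as the converted board files below do.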
CC: Sascha Hauer CC: Fabio Estevam Acked-by: Greg Ungerer Reviewed-by: Oleksij Rempel Signed-off-by: Trent Piepho Signed-off-by: Shawn Guo --- arch/arm/mach-imx/mach-mx31_3ds.c | 18 ++---------------- arch/arm/mach-imx/mach-mx31lilly.c | 12 ++---------- arch/arm/mach-imx/mach-mx31lite.c | 16 ++-------------- arch/arm/mach-imx/mach-mx31moboard.c | 17 +++-------------- arch/arm/mach-imx/mach-pcm037_eet.c | 5 +---- include/linux/platform_data/spi-imx.h | 29 +++++++++++++++++------------ 6 files changed, 27 insertions(+), 70 deletions(-) (limited to 'include') diff --git a/arch/arm/mach-imx/mach-mx31_3ds.c b/arch/arm/mach-imx/mach-mx31_3ds.c index 68c3f0799d5b..9d87f1dcf7bb 100644 --- a/arch/arm/mach-imx/mach-mx31_3ds.c +++ b/arch/arm/mach-imx/mach-mx31_3ds.c @@ -374,26 +374,12 @@ static struct imx_ssi_platform_data mx31_3ds_ssi_pdata = { }; /* SPI */ -static int spi0_internal_chipselect[] = { - MXC_SPI_CS(0), - MXC_SPI_CS(1), - MXC_SPI_CS(2), -}; - static const struct spi_imx_master spi0_pdata __initconst = { - .chipselect = spi0_internal_chipselect, - .num_chipselect = ARRAY_SIZE(spi0_internal_chipselect), -}; - -static int spi1_internal_chipselect[] = { - MXC_SPI_CS(0), - MXC_SPI_CS(1), - MXC_SPI_CS(2), + .num_chipselect = 3, }; static const struct spi_imx_master spi1_pdata __initconst = { - .chipselect = spi1_internal_chipselect, - .num_chipselect = ARRAY_SIZE(spi1_internal_chipselect), + .num_chipselect = 3, }; static struct spi_board_info mx31_3ds_spi_devs[] __initdata = { diff --git a/arch/arm/mach-imx/mach-mx31lilly.c b/arch/arm/mach-imx/mach-mx31lilly.c index 6fd463642954..8bf52819d4d9 100644 --- a/arch/arm/mach-imx/mach-mx31lilly.c +++ b/arch/arm/mach-imx/mach-mx31lilly.c @@ -226,20 +226,12 @@ static void __init lilly1131_usb_init(void) /* SPI */ -static int spi_internal_chipselect[] = { - MXC_SPI_CS(0), - MXC_SPI_CS(1), - MXC_SPI_CS(2), -}; - static const struct spi_imx_master spi0_pdata __initconst = { - .chipselect = spi_internal_chipselect, - .num_chipselect = ARRAY_SIZE(spi_internal_chipselect), + .num_chipselect = 3, }; static const struct spi_imx_master spi1_pdata __initconst = { - .chipselect = spi_internal_chipselect, - .num_chipselect = ARRAY_SIZE(spi_internal_chipselect), + .num_chipselect = 3, }; static struct mc13xxx_platform_data mc13783_pdata __initdata = { diff --git a/arch/arm/mach-imx/mach-mx31lite.c b/arch/arm/mach-imx/mach-mx31lite.c index a3250bc7f114..a3cbba6c955b 100644 --- a/arch/arm/mach-imx/mach-mx31lite.c +++ b/arch/arm/mach-imx/mach-mx31lite.c @@ -83,15 +83,8 @@ static const struct imxuart_platform_data uart_pdata __initconst = { }; /* SPI */ -static int spi0_internal_chipselect[] = { - MXC_SPI_CS(0), - MXC_SPI_CS(1), - MXC_SPI_CS(2), -}; - static const struct spi_imx_master spi0_pdata __initconst = { - .chipselect = spi0_internal_chipselect, - .num_chipselect = ARRAY_SIZE(spi0_internal_chipselect), + .num_chipselect = 3, }; static const struct mxc_nand_platform_data @@ -133,13 +126,8 @@ static struct platform_device smsc911x_device = { * The MC13783 is the only hard-wired SPI device on the module. 
*/ -static int spi1_internal_chipselect[] = { - MXC_SPI_CS(0), -}; - static const struct spi_imx_master spi1_pdata __initconst = { - .chipselect = spi1_internal_chipselect, - .num_chipselect = ARRAY_SIZE(spi1_internal_chipselect), + .num_chipselect = 1, }; static struct mc13xxx_platform_data mc13783_pdata __initdata = { diff --git a/arch/arm/mach-imx/mach-mx31moboard.c b/arch/arm/mach-imx/mach-mx31moboard.c index 7716f83aecdd..643a3d749703 100644 --- a/arch/arm/mach-imx/mach-mx31moboard.c +++ b/arch/arm/mach-imx/mach-mx31moboard.c @@ -152,14 +152,8 @@ static const struct imxi2c_platform_data moboard_i2c1_data __initconst = { .bitrate = 100000, }; -static int moboard_spi1_cs[] = { - MXC_SPI_CS(0), - MXC_SPI_CS(2), -}; - static const struct spi_imx_master moboard_spi1_pdata __initconst = { - .chipselect = moboard_spi1_cs, - .num_chipselect = ARRAY_SIZE(moboard_spi1_cs), + .num_chipselect = 3, }; static struct regulator_consumer_supply sdhc_consumers[] = { @@ -296,19 +290,14 @@ static struct spi_board_info moboard_spi_board_info[] __initdata = { /* irq number is run-time assigned */ .max_speed_hz = 300000, .bus_num = 1, - .chip_select = 1, + .chip_select = 0, .platform_data = &moboard_pmic, .mode = SPI_CS_HIGH, }, }; -static int moboard_spi2_cs[] = { - MXC_SPI_CS(0), MXC_SPI_CS(1), -}; - static const struct spi_imx_master moboard_spi2_pdata __initconst = { - .chipselect = moboard_spi2_cs, - .num_chipselect = ARRAY_SIZE(moboard_spi2_cs), + .num_chipselect = 2, }; #define SDHC1_CD IOMUX_TO_GPIO(MX31_PIN_ATA_CS0) diff --git a/arch/arm/mach-imx/mach-pcm037_eet.c b/arch/arm/mach-imx/mach-pcm037_eet.c index 95bd97710494..15bc956d466b 100644 --- a/arch/arm/mach-imx/mach-pcm037_eet.c +++ b/arch/arm/mach-imx/mach-pcm037_eet.c @@ -56,11 +56,8 @@ static struct spi_board_info pcm037_spi_dev[] = { }; /* Platform Data for MXC CSPI */ -static int pcm037_spi1_cs[] = { MXC_SPI_CS(0), MXC_SPI_CS(1), }; - static const struct spi_imx_master pcm037_spi1_pdata __initconst = { - .chipselect = pcm037_spi1_cs, - .num_chipselect = ARRAY_SIZE(pcm037_spi1_cs), + .num_chipselect = 2, }; /* GPIO-keys input device */ diff --git a/include/linux/platform_data/spi-imx.h b/include/linux/platform_data/spi-imx.h index 6f012fefa1a2..328f670d10bd 100644 --- a/include/linux/platform_data/spi-imx.h +++ b/include/linux/platform_data/spi-imx.h @@ -5,24 +5,29 @@ /* * struct spi_imx_master - device.platform_data for SPI controller devices. - * @chipselect: Array of chipselects for this master. Numbers >= 0 mean gpio - * pins, numbers < 0 mean internal CSPI chipselects according - * to MXC_SPI_CS(). Normally you want to use gpio based chip - * selects as the CSPI module tries to be intelligent about - * when to assert the chipselect: The CSPI module deasserts the - * chipselect once it runs out of input data. The other problem - * is that it is not possible to mix between high active and low - * active chipselects on one single bus using the internal - * chipselects. Unfortunately Freescale decided to put some + * @chipselect: Array of chipselects for this master or NULL. Numbers >= 0 + * mean GPIO pins, -ENOENT means internal CSPI chipselect + * matching the position in the array. E.g., if chipselect[1] = + * -ENOENT then a SPI slave using chip select 1 will use the + * native SS1 line of the CSPI. Omitting the array will use + * all native chip selects. 
+ + * Normally you want to use gpio based chip selects as the CSPI + * module tries to be intelligent about when to assert the + * chipselect: The CSPI module deasserts the chipselect once it + * runs out of input data. The other problem is that it is not + * possible to mix between high active and low active chipselects + * on one single bus using the internal chipselects. + * Unfortunately, on some SoCs, Freescale decided to put some * chipselects on dedicated pins which are not usable as gpios, * so we have to support the internal chipselects. - * @num_chipselect: ARRAY_SIZE(chipselect) + * + * @num_chipselect: If @chipselect is specified, ARRAY_SIZE(chipselect), + * otherwise the number of native chip selects. */ struct spi_imx_master { int *chipselect; int num_chipselect; }; -#define MXC_SPI_CS(no) ((no) - 32) - #endif /* __MACH_SPI_H_*/ -- cgit v1.2.3 From a919525ad832d2bb1388b2303832a2307b30aeff Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:07 -0700 Subject: net: Move fib_convert_metrics to metrics file Move logic of fib_convert_metrics into ip_metrics_convert. This allows the code that converts netlink attributes into metrics struct to be re-used in a later patch by IPv6. This is mostly a code move with the following changes to variable names: - fi->fib_net becomes net - fc_mx and fc_mx_len are passed as inputs pulled from fib_config - metrics array is passed as an input from fi->fib_metrics->metrics Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip.h | 3 +++ net/ipv4/Makefile | 3 ++- net/ipv4/fib_semantics.c | 43 ++------------------------------------- net/ipv4/metrics.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 60 insertions(+), 42 deletions(-) create mode 100644 net/ipv4/metrics.c (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index ecffd843e7b8..dc4a2d6e58a5 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -396,6 +396,9 @@ static inline unsigned int ip_skb_dst_mtu(struct sock *sk, return min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU); } +int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len, + u32 *metrics); + u32 ip_idents_reserve(u32 hash, int segs); void __ip_select_ident(struct net *net, struct iphdr *iph, int segs); diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index a07b7dd06def..b379520f9133 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -13,7 +13,8 @@ obj-y := route.o inetpeer.o protocol.o \ tcp_offload.o datagram.o raw.o udp.o udplite.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \ - inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o + inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \ + metrics.o obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index c27122f01b87..6608db23f54b 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1019,47 +1019,8 @@ static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc) static int fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) { - bool ecn_ca = false; - struct nlattr *nla; - int remaining; - - if (!cfg->fc_mx) - return 0; - - nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { - int type = nla_type(nla); - u32 val; - - if (!type) - continue; - if (type > RTAX_MAX) - return -EINVAL; - - if (type == RTAX_CC_ALGO) { - char 
tmp[TCP_CA_NAME_MAX]; - - nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca); - if (val == TCP_CA_UNSPEC) - return -EINVAL; - } else { - val = nla_get_u32(nla); - } - if (type == RTAX_ADVMSS && val > 65535 - 40) - val = 65535 - 40; - if (type == RTAX_MTU && val > 65535 - 15) - val = 65535 - 15; - if (type == RTAX_HOPLIMIT && val > 255) - val = 255; - if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) - return -EINVAL; - fi->fib_metrics->metrics[type - 1] = val; - } - - if (ecn_ca) - fi->fib_metrics->metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; - - return 0; + return ip_metrics_convert(fi->fib_net, cfg->fc_mx, cfg->fc_mx_len, + fi->fib_metrics->metrics); } struct fib_info *fib_create_info(struct fib_config *cfg, diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c new file mode 100644 index 000000000000..5121c6475e6b --- /dev/null +++ b/net/ipv4/metrics.c @@ -0,0 +1,53 @@ +#include +#include +#include +#include +#include +#include + +int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len, + u32 *metrics) +{ + bool ecn_ca = false; + struct nlattr *nla; + int remaining; + + if (!fc_mx) + return 0; + + nla_for_each_attr(nla, fc_mx, fc_mx_len, remaining) { + int type = nla_type(nla); + u32 val; + + if (!type) + continue; + if (type > RTAX_MAX) + return -EINVAL; + + if (type == RTAX_CC_ALGO) { + char tmp[TCP_CA_NAME_MAX]; + + nla_strlcpy(tmp, nla, sizeof(tmp)); + val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca); + if (val == TCP_CA_UNSPEC) + return -EINVAL; + } else { + val = nla_get_u32(nla); + } + if (type == RTAX_ADVMSS && val > 65535 - 40) + val = 65535 - 40; + if (type == RTAX_MTU && val > 65535 - 15) + val = 65535 - 15; + if (type == RTAX_HOPLIMIT && val > 255) + val = 255; + if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) + return -EINVAL; + metrics[type - 1] = val; + } + + if (ecn_ca) + metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; + + return 0; +} +EXPORT_SYMBOL_GPL(ip_metrics_convert); -- cgit v1.2.3 From 7aef6859ee91ea867a3dff9ba47bca9b2de382f6 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:10 -0700 Subject: net/ipv6: Pass net to fib6_update_sernum Pass net namespace to fib6_update_sernum. It can not be marked const as fib6_new_sernum will change ipv6.fib6_sernum. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_fib.h | 2 +- net/ipv6/ip6_fib.c | 3 +-- net/ipv6/route.c | 10 +++++----- 3 files changed, 7 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 5e86fd9dc857..f0aaf1c8f1a8 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -408,7 +408,7 @@ void __net_exit fib6_notifier_exit(struct net *net); unsigned int fib6_tables_seq_read(struct net *net); int fib6_tables_dump(struct net *net, struct notifier_block *nb); -void fib6_update_sernum(struct rt6_info *rt); +void fib6_update_sernum(struct net *net, struct rt6_info *rt); void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt); #ifdef CONFIG_IPV6_MULTIPLE_TABLES diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index deab2db6692e..74d2a3748e2f 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -105,9 +105,8 @@ enum { FIB6_NO_SERNUM_CHANGE = 0, }; -void fib6_update_sernum(struct rt6_info *rt) +void fib6_update_sernum(struct net *net, struct rt6_info *rt) { - struct net *net = dev_net(rt->dst.dev); struct fib6_node *fn; fn = rcu_dereference_protected(rt->rt6i_node, diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 1d738bfe893b..0a99cda9fd7b 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1352,7 +1352,7 @@ out: /* Update fn->fn_sernum to invalidate all cached dst */ if (!err) { spin_lock_bh(&ort->rt6i_table->tb6_lock); - fib6_update_sernum(ort); + fib6_update_sernum(net, ort); spin_unlock_bh(&ort->rt6i_table->tb6_lock); fib6_force_start_gc(net); } @@ -3786,11 +3786,11 @@ void rt6_multipath_rebalance(struct rt6_info *rt) static int fib6_ifup(struct rt6_info *rt, void *p_arg) { const struct arg_netdev_event *arg = p_arg; - const struct net *net = dev_net(arg->dev); + struct net *net = dev_net(arg->dev); if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) { rt->rt6i_nh_flags &= ~arg->nh_flags; - fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt); + fib6_update_sernum_upto_root(net, rt); rt6_multipath_rebalance(rt); } @@ -3869,7 +3869,7 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg) { const struct arg_netdev_event *arg = p_arg; const struct net_device *dev = arg->dev; - const struct net *net = dev_net(dev); + struct net *net = dev_net(dev); if (rt == net->ipv6.ip6_null_entry) return 0; @@ -3892,7 +3892,7 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg) } rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | RTNH_F_LINKDOWN); - fib6_update_sernum(rt); + fib6_update_sernum(net, rt); rt6_multipath_rebalance(rt); } return -2; -- cgit v1.2.3 From afb1d4b59311a8252f67c214b37ec69d8100cb55 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:11 -0700 Subject: net/ipv6: Pass net namespace to route functions Pass network namespace reference into route add, delete and get functions. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_route.h | 12 ++++++----- net/ipv6/addrconf.c | 33 ++++++++++++++++-------------- net/ipv6/anycast.c | 10 +++++---- net/ipv6/ndisc.c | 12 ++++++----- net/ipv6/route.c | 54 +++++++++++++++++++++++++------------------------ 5 files changed, 66 insertions(+), 55 deletions(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 08b132381984..1130a1144dfd 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -101,8 +101,8 @@ void ip6_route_cleanup(void); int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg); int ip6_route_add(struct fib6_config *cfg, struct netlink_ext_ack *extack); -int ip6_ins_rt(struct rt6_info *); -int ip6_del_rt(struct rt6_info *); +int ip6_ins_rt(struct net *net, struct rt6_info *rt); +int ip6_del_rt(struct net *net, struct rt6_info *rt); void rt6_flush_exceptions(struct rt6_info *rt); int rt6_remove_exception_rt(struct rt6_info *rt); @@ -137,7 +137,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6); void fib6_force_start_gc(struct net *net); -struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, +struct rt6_info *addrconf_dst_alloc(struct net *net, struct inet6_dev *idev, const struct in6_addr *addr, bool anycast); struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, @@ -147,9 +147,11 @@ struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, * support functions for ND * */ -struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, +struct rt6_info *rt6_get_dflt_router(struct net *net, + const struct in6_addr *addr, struct net_device *dev); -struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr, +struct rt6_info *rt6_add_dflt_router(struct net *net, + const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref); void rt6_purge_dflt_routers(struct net *net); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index b2c0175125db..7ff7466c52e5 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1037,7 +1037,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, goto out; } - rt = addrconf_dst_alloc(idev, addr, false); + rt = addrconf_dst_alloc(net, idev, addr, false); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; @@ -1187,7 +1187,7 @@ cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_r 0, RTF_GATEWAY | RTF_DEFAULT); if (rt) { if (del_rt) - ip6_del_rt(rt); + ip6_del_rt(dev_net(ifp->idev->dev), rt); else { if (!(rt->rt6i_flags & RTF_EXPIRES)) rt6_set_expires(rt, expires); @@ -2666,7 +2666,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) if (rt) { /* Autoconf prefix route */ if (valid_lft == 0) { - ip6_del_rt(rt); + ip6_del_rt(net, rt); rt = NULL; } else if (addrconf_finite_timeout(rt_expires)) { /* not infinity */ @@ -3333,7 +3333,8 @@ static void addrconf_gre_config(struct net_device *dev) } #endif -static int fixup_permanent_addr(struct inet6_dev *idev, +static int fixup_permanent_addr(struct net *net, + struct inet6_dev *idev, struct inet6_ifaddr *ifp) { /* !rt6i_node means the host route was removed from the @@ -3343,7 +3344,7 @@ static int fixup_permanent_addr(struct inet6_dev *idev, if (!ifp->rt || !ifp->rt->rt6i_node) { struct rt6_info *rt, *prev; - rt = addrconf_dst_alloc(idev, &ifp->addr, false); + rt = addrconf_dst_alloc(net, idev, &ifp->addr, false); if (IS_ERR(rt)) return PTR_ERR(rt); @@ -3367,7 +3368,7 @@ static int fixup_permanent_addr(struct inet6_dev *idev, 
return 0; } -static void addrconf_permanent_addr(struct net_device *dev) +static void addrconf_permanent_addr(struct net *net, struct net_device *dev) { struct inet6_ifaddr *ifp, *tmp; struct inet6_dev *idev; @@ -3380,7 +3381,7 @@ static void addrconf_permanent_addr(struct net_device *dev) list_for_each_entry_safe(ifp, tmp, &idev->addr_list, if_list) { if ((ifp->flags & IFA_F_PERMANENT) && - fixup_permanent_addr(idev, ifp) < 0) { + fixup_permanent_addr(net, idev, ifp) < 0) { write_unlock_bh(&idev->lock); in6_ifa_hold(ifp); ipv6_del_addr(ifp); @@ -3449,7 +3450,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, if (event == NETDEV_UP) { /* restore routes for permanent addresses */ - addrconf_permanent_addr(dev); + addrconf_permanent_addr(net, dev); if (!addrconf_link_ready(dev)) { /* device is not ready yet. */ @@ -3735,7 +3736,7 @@ restart: spin_unlock_bh(&ifa->lock); if (rt) - ip6_del_rt(rt); + ip6_del_rt(net, rt); if (state != INET6_IFADDR_STATE_DEAD) { __ipv6_ifa_notify(RTM_DELADDR, ifa); @@ -3853,6 +3854,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) struct inet6_dev *idev = ifp->idev; struct net_device *dev = idev->dev; bool bump_id, notify = false; + struct net *net; addrconf_join_solict(dev, &ifp->addr); @@ -3863,8 +3865,9 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) if (ifp->state == INET6_IFADDR_STATE_DEAD) goto out; + net = dev_net(dev); if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) || - (dev_net(dev)->ipv6.devconf_all->accept_dad < 1 && + (net->ipv6.devconf_all->accept_dad < 1 && idev->cnf.accept_dad < 1) || !(ifp->flags&IFA_F_TENTATIVE) || ifp->flags & IFA_F_NODAD) { @@ -3900,8 +3903,8 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) * Frames right away */ if (ifp->flags & IFA_F_OPTIMISTIC) { - ip6_ins_rt(ifp->rt); - if (ipv6_use_optimistic_addr(dev_net(dev), idev)) { + ip6_ins_rt(net, ifp->rt); + if (ipv6_use_optimistic_addr(net, idev)) { /* Because optimistic nodes can use this address, * notify listeners. If DAD fails, RTM_DELADDR is sent. 
*/ @@ -5604,7 +5607,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) * to do it again */ if (!rcu_access_pointer(ifp->rt->rt6i_node)) - ip6_ins_rt(ifp->rt); + ip6_ins_rt(net, ifp->rt); if (ifp->idev->cnf.forwarding) addrconf_join_anycast(ifp); if (!ipv6_addr_any(&ifp->peer_addr)) @@ -5621,11 +5624,11 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) rt = addrconf_get_prefix_route(&ifp->peer_addr, 128, ifp->idev->dev, 0, 0); if (rt) - ip6_del_rt(rt); + ip6_del_rt(net, rt); } if (ifp->rt) { if (dst_hold_safe(&ifp->rt->dst)) - ip6_del_rt(ifp->rt); + ip6_del_rt(net, ifp->rt); } rt_genid_bump_ipv6(net); break; diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index bbcabbba9bd8..1122fb299b86 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -247,6 +247,7 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) { struct ifacaddr6 *aca; struct rt6_info *rt; + struct net *net; int err; ASSERT_RTNL(); @@ -265,7 +266,8 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) } } - rt = addrconf_dst_alloc(idev, addr, true); + net = dev_net(idev->dev); + rt = addrconf_dst_alloc(net, idev, addr, true); if (IS_ERR(rt)) { err = PTR_ERR(rt); goto out; @@ -286,7 +288,7 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) aca_get(aca); write_unlock_bh(&idev->lock); - ip6_ins_rt(rt); + ip6_ins_rt(net, rt); addrconf_join_solict(idev->dev, &aca->aca_addr); @@ -329,7 +331,7 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) addrconf_leave_solict(idev, &aca->aca_addr); dst_hold(&aca->aca_rt->dst); - ip6_del_rt(aca->aca_rt); + ip6_del_rt(dev_net(idev->dev), aca->aca_rt); aca_put(aca); return 0; @@ -357,7 +359,7 @@ void ipv6_ac_destroy_dev(struct inet6_dev *idev) addrconf_leave_solict(idev, &aca->aca_addr); dst_hold(&aca->aca_rt->dst); - ip6_del_rt(aca->aca_rt); + ip6_del_rt(dev_net(idev->dev), aca->aca_rt); aca_put(aca); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 9de4dfb126ba..3fbc3805e69b 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1156,6 +1156,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) struct neighbour *neigh = NULL; struct inet6_dev *in6_dev; struct rt6_info *rt = NULL; + struct net *net; int lifetime; struct ndisc_options ndopts; int optlen; @@ -1253,9 +1254,9 @@ static void ndisc_router_discovery(struct sk_buff *skb) /* Do not accept RA with source-addr found on local machine unless * accept_ra_from_local is set to true. 
*/ + net = dev_net(in6_dev->dev); if (!in6_dev->cnf.accept_ra_from_local && - ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, - in6_dev->dev, 0)) { + ipv6_chk_addr(net, &ipv6_hdr(skb)->saddr, in6_dev->dev, 0)) { ND_PRINTK(2, info, "RA from local address detected on dev: %s: default router ignored\n", skb->dev->name); @@ -1272,7 +1273,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) pref = ICMPV6_ROUTER_PREF_MEDIUM; #endif - rt = rt6_get_dflt_router(&ipv6_hdr(skb)->saddr, skb->dev); + rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev); if (rt) { neigh = dst_neigh_lookup(&rt->dst, &ipv6_hdr(skb)->saddr); @@ -1285,7 +1286,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) } } if (rt && lifetime == 0) { - ip6_del_rt(rt); + ip6_del_rt(net, rt); rt = NULL; } @@ -1294,7 +1295,8 @@ static void ndisc_router_discovery(struct sk_buff *skb) if (!rt && lifetime) { ND_PRINTK(3, info, "RA: adding default router\n"); - rt = rt6_add_dflt_router(&ipv6_hdr(skb)->saddr, skb->dev, pref); + rt = rt6_add_dflt_router(net, &ipv6_hdr(skb)->saddr, + skb->dev, pref); if (!rt) { ND_PRINTK(0, err, "RA: %s failed to add default route\n", diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0a99cda9fd7b..045811a3da76 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -850,13 +850,13 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, } if (rinfo->prefix_len == 0) - rt = rt6_get_dflt_router(gwaddr, dev); + rt = rt6_get_dflt_router(net, gwaddr, dev); else rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev); if (rt && !lifetime) { - ip6_del_rt(rt); + ip6_del_rt(net, rt); rt = NULL; } @@ -1014,9 +1014,9 @@ static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info, return err; } -int ip6_ins_rt(struct rt6_info *rt) +int ip6_ins_rt(struct net *net, struct rt6_info *rt) { - struct nl_info info = { .nl_net = dev_net(rt->dst.dev), }; + struct nl_info info = { .nl_net = net, }; struct mx6_config mxc = { .mx = NULL, }; /* Hold dst to account for the reference from the fib6 tree */ @@ -1121,14 +1121,13 @@ static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt) return pcpu_rt; } -static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) +static struct rt6_info *rt6_make_pcpu_route(struct net *net, + struct rt6_info *rt) { struct rt6_info *pcpu_rt, *prev, **p; pcpu_rt = ip6_rt_pcpu_alloc(rt); if (!pcpu_rt) { - struct net *net = dev_net(rt->dst.dev); - dst_hold(&net->ipv6.ip6_null_entry->dst); return net->ipv6.ip6_null_entry; } @@ -1787,7 +1786,7 @@ uncached_rt_out: /* No dst_hold() on rt is needed because grabbing * rt->rt6i_ref makes sure rt can't be released. 
*/ - pcpu_rt = rt6_make_pcpu_route(rt); + pcpu_rt = rt6_make_pcpu_route(net, rt); rt6_release(rt); } else { /* rt is already removed from tree */ @@ -2088,7 +2087,7 @@ static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) if (rt) { if (rt->rt6i_flags & RTF_CACHE) { if (rt6_check_expired(rt)) { - ip6_del_rt(rt); + ip6_del_rt(dev_net(dst->dev), rt); dst = NULL; } } else { @@ -2109,7 +2108,7 @@ static void ip6_link_failure(struct sk_buff *skb) if (rt) { if (rt->rt6i_flags & RTF_CACHE) { if (dst_hold_safe(&rt->dst)) - ip6_del_rt(rt); + ip6_del_rt(dev_net(rt->dst.dev), rt); } else { struct fib6_node *fn; @@ -3018,9 +3017,9 @@ out: static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) { - int err; + struct net *net = info->nl_net; struct fib6_table *table; - struct net *net = dev_net(rt->dst.dev); + int err; if (rt == net->ipv6.ip6_null_entry) { err = -ENOENT; @@ -3037,11 +3036,10 @@ out: return err; } -int ip6_del_rt(struct rt6_info *rt) +int ip6_del_rt(struct net *net, struct rt6_info *rt) { - struct nl_info info = { - .nl_net = dev_net(rt->dst.dev), - }; + struct nl_info info = { .nl_net = net }; + return __ip6_del_rt(rt, &info); } @@ -3376,13 +3374,15 @@ static struct rt6_info *rt6_add_route_info(struct net *net, } #endif -struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev) +struct rt6_info *rt6_get_dflt_router(struct net *net, + const struct in6_addr *addr, + struct net_device *dev) { u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT; struct rt6_info *rt; struct fib6_table *table; - table = fib6_get_table(dev_net(dev), tb_id); + table = fib6_get_table(net, tb_id); if (!table) return NULL; @@ -3399,7 +3399,8 @@ struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_dev return rt; } -struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr, +struct rt6_info *rt6_add_dflt_router(struct net *net, + const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref) { @@ -3412,7 +3413,7 @@ struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr, .fc_protocol = RTPROT_RA, .fc_nlinfo.portid = 0, .fc_nlinfo.nlh = NULL, - .fc_nlinfo.nl_net = dev_net(dev), + .fc_nlinfo.nl_net = net, }; cfg.fc_gateway = *gwaddr; @@ -3425,10 +3426,11 @@ struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr, table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; } - return rt6_get_dflt_router(gwaddr, dev); + return rt6_get_dflt_router(net, gwaddr, dev); } -static void __rt6_purge_dflt_routers(struct fib6_table *table) +static void __rt6_purge_dflt_routers(struct net *net, + struct fib6_table *table) { struct rt6_info *rt; @@ -3439,7 +3441,7 @@ restart: (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) { if (dst_hold_safe(&rt->dst)) { rcu_read_unlock(); - ip6_del_rt(rt); + ip6_del_rt(net, rt); } else { rcu_read_unlock(); } @@ -3463,7 +3465,7 @@ void rt6_purge_dflt_routers(struct net *net) head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(table, head, tb6_hlist) { if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) - __rt6_purge_dflt_routers(table); + __rt6_purge_dflt_routers(net, table); } } @@ -3583,12 +3585,12 @@ static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff * Allocate a dst for local (unicast / anycast) address. 
*/ -struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, +struct rt6_info *addrconf_dst_alloc(struct net *net, + struct inet6_dev *idev, const struct in6_addr *addr, bool anycast) { u32 tb_id; - struct net *net = dev_net(idev->dev); struct net_device *dev = idev->dev; struct rt6_info *rt; -- cgit v1.2.3 From e8478e80e5a74f4ce47b043735f0066588fb64c7 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:13 -0700 Subject: net/ipv6: Save route type in rt6_info The RTN_ type for IPv6 FIB entries is currently embedded in rt6i_flags and dst.error. Since dst is going to be removed, it can no longer be relied on for FIB dumps so save the route type as fib6_type. fc_type is set in current users based on the algorithm in rt6_fill_node: - rt6i_flags contains RTF_LOCAL: fc_type = RTN_LOCAL - rt6i_flags contains RTF_ANYCAST: fc_type = RTN_ANYCAST - else fc_type = RTN_UNICAST Similarly, fib6_type is set in the rt6_info templates based on the RTF_REJECT section of rt6_fill_node converting dst.error to RTN type. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 1 + net/ipv6/addrconf.c | 2 ++ net/ipv6/route.c | 46 ++++++++++++++++++++-------------------------- 3 files changed, 23 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index f0aaf1c8f1a8..0165820bbafb 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -174,6 +174,7 @@ struct rt6_info { int rt6i_nh_weight; unsigned short rt6i_nfheader_len; u8 rt6i_protocol; + u8 fib6_type; u8 exception_bucket_flushed:1, should_flush:1, unused:6; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 7ff7466c52e5..f71fcf2635d5 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2331,6 +2331,7 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev, .fc_flags = RTF_UP | flags, .fc_nlinfo.nl_net = dev_net(dev), .fc_protocol = RTPROT_KERNEL, + .fc_type = RTN_UNICAST, }; cfg.fc_dst = *pfx; @@ -2394,6 +2395,7 @@ static void addrconf_add_mroute(struct net_device *dev) .fc_ifindex = dev->ifindex, .fc_dst_len = 8, .fc_flags = RTF_UP, + .fc_type = RTN_UNICAST, .fc_nlinfo.nl_net = dev_net(dev), }; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0daf4c9c9f2b..1cc01f5bb773 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -307,6 +307,7 @@ static const struct rt6_info ip6_null_entry_template = { .rt6i_protocol = RTPROT_KERNEL, .rt6i_metric = ~(u32) 0, .rt6i_ref = ATOMIC_INIT(1), + .fib6_type = RTN_UNREACHABLE, }; #ifdef CONFIG_IPV6_MULTIPLE_TABLES @@ -324,6 +325,7 @@ static const struct rt6_info ip6_prohibit_entry_template = { .rt6i_protocol = RTPROT_KERNEL, .rt6i_metric = ~(u32) 0, .rt6i_ref = ATOMIC_INIT(1), + .fib6_type = RTN_PROHIBIT, }; static const struct rt6_info ip6_blk_hole_entry_template = { @@ -339,6 +341,7 @@ static const struct rt6_info ip6_blk_hole_entry_template = { .rt6i_protocol = RTPROT_KERNEL, .rt6i_metric = ~(u32) 0, .rt6i_ref = ATOMIC_INIT(1), + .fib6_type = RTN_BLACKHOLE, }; #endif @@ -2802,6 +2805,11 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, goto out; } + if (cfg->fc_type > RTN_MAX) { + NL_SET_ERR_MSG(extack, "Invalid route type"); + goto out; + } + if (cfg->fc_dst_len > 128) { NL_SET_ERR_MSG(extack, "Invalid prefix length"); goto out; @@ -2914,6 +2922,8 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, rt->rt6i_metric = cfg->fc_metric; rt->rt6i_nh_weight = 1; + rt->fib6_type = cfg->fc_type; + /* We cannot add 
true routes via loopback here, they would result in kernel looping; promote them to reject routes */ @@ -3354,6 +3364,7 @@ static struct rt6_info *rt6_add_route_info(struct net *net, .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref), .fc_protocol = RTPROT_RA, + .fc_type = RTN_UNICAST, .fc_nlinfo.portid = 0, .fc_nlinfo.nlh = NULL, .fc_nlinfo.nl_net = net, @@ -3410,6 +3421,7 @@ struct rt6_info *rt6_add_dflt_router(struct net *net, .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES | RTF_PREF(pref), .fc_protocol = RTPROT_RA, + .fc_type = RTN_UNICAST, .fc_nlinfo.portid = 0, .fc_nlinfo.nlh = NULL, .fc_nlinfo.nl_net = net, @@ -3485,6 +3497,7 @@ static void rtmsg_to_fib6_config(struct net *net, cfg->fc_dst_len = rtmsg->rtmsg_dst_len; cfg->fc_src_len = rtmsg->rtmsg_src_len; cfg->fc_flags = rtmsg->rtmsg_flags; + cfg->fc_type = rtmsg->rtmsg_type; cfg->fc_nlinfo.nl_net = net; @@ -3606,10 +3619,13 @@ struct rt6_info *addrconf_dst_alloc(struct net *net, rt->rt6i_protocol = RTPROT_KERNEL; rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP; - if (anycast) + if (anycast) { + rt->fib6_type = RTN_ANYCAST; rt->rt6i_flags |= RTF_ANYCAST; - else + } else { + rt->fib6_type = RTN_LOCAL; rt->rt6i_flags |= RTF_LOCAL; + } rt->rt6i_gateway = *addr; rt->rt6i_dst.addr = *addr; @@ -4509,30 +4525,8 @@ static int rt6_fill_node(struct net *net, rtm->rtm_table = table; if (nla_put_u32(skb, RTA_TABLE, table)) goto nla_put_failure; - if (rt->rt6i_flags & RTF_REJECT) { - switch (rt->dst.error) { - case -EINVAL: - rtm->rtm_type = RTN_BLACKHOLE; - break; - case -EACCES: - rtm->rtm_type = RTN_PROHIBIT; - break; - case -EAGAIN: - rtm->rtm_type = RTN_THROW; - break; - default: - rtm->rtm_type = RTN_UNREACHABLE; - break; - } - } - else if (rt->rt6i_flags & RTF_LOCAL) - rtm->rtm_type = RTN_LOCAL; - else if (rt->rt6i_flags & RTF_ANYCAST) - rtm->rtm_type = RTN_ANYCAST; - else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK)) - rtm->rtm_type = RTN_LOCAL; - else - rtm->rtm_type = RTN_UNICAST; + + rtm->rtm_type = rt->fib6_type; rtm->rtm_flags = 0; rtm->rtm_scope = RT_SCOPE_UNIVERSE; rtm->rtm_protocol = rt->rt6i_protocol; -- cgit v1.2.3 From 5e670d844b2a4e47d1b9b9aceb14dd3c12a6d4bf Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:14 -0700 Subject: net/ipv6: Move nexthop data to fib6_nh Introduce fib6_nh structure and move nexthop related data from rt6_info and rt6_info.dst to fib6_nh. References to dev, gateway or lwtstate from a FIB lookup perspective are converted to use fib6_nh; datapath references to dst version are left as is. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- .../net/ethernet/mellanox/mlxsw/spectrum_router.c | 32 ++-- include/net/ip6_fib.h | 16 +- include/net/ip6_route.h | 6 +- net/ipv6/addrconf.c | 2 +- net/ipv6/ip6_fib.c | 6 +- net/ipv6/route.c | 162 +++++++++++---------- 6 files changed, 125 insertions(+), 99 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 1904c0323d39..d995a0b52d7c 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -2770,9 +2770,9 @@ mlxsw_sp_nexthop6_group_cmp(const struct mlxsw_sp_nexthop_group *nh_grp, struct in6_addr *gw; int ifindex, weight; - ifindex = mlxsw_sp_rt6->rt->dst.dev->ifindex; - weight = mlxsw_sp_rt6->rt->rt6i_nh_weight; - gw = &mlxsw_sp_rt6->rt->rt6i_gateway; + ifindex = mlxsw_sp_rt6->rt->fib6_nh.nh_dev->ifindex; + weight = mlxsw_sp_rt6->rt->fib6_nh.nh_weight; + gw = &mlxsw_sp_rt6->rt->fib6_nh.nh_gw; if (!mlxsw_sp_nexthop6_group_has_nexthop(nh_grp, gw, ifindex, weight)) return false; @@ -2838,7 +2838,7 @@ mlxsw_sp_nexthop6_group_hash(struct mlxsw_sp_fib6_entry *fib6_entry, u32 seed) struct net_device *dev; list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) { - dev = mlxsw_sp_rt6->rt->dst.dev; + dev = mlxsw_sp_rt6->rt->fib6_nh.nh_dev; val ^= dev->ifindex; } @@ -3836,9 +3836,9 @@ mlxsw_sp_rt6_nexthop(struct mlxsw_sp_nexthop_group *nh_grp, struct mlxsw_sp_nexthop *nh = &nh_grp->nexthops[i]; struct rt6_info *rt = mlxsw_sp_rt6->rt; - if (nh->rif && nh->rif->dev == rt->dst.dev && + if (nh->rif && nh->rif->dev == rt->fib6_nh.nh_dev && ipv6_addr_equal((const struct in6_addr *) &nh->gw_addr, - &rt->rt6i_gateway)) + &rt->fib6_nh.nh_gw)) return nh; continue; } @@ -3895,7 +3895,7 @@ mlxsw_sp_fib6_entry_offload_set(struct mlxsw_sp_fib_entry *fib_entry) if (fib_entry->type == MLXSW_SP_FIB_ENTRY_TYPE_LOCAL) { list_first_entry(&fib6_entry->rt6_list, struct mlxsw_sp_rt6, - list)->rt->rt6i_nh_flags |= RTNH_F_OFFLOAD; + list)->rt->fib6_nh.nh_flags |= RTNH_F_OFFLOAD; return; } @@ -3905,9 +3905,9 @@ mlxsw_sp_fib6_entry_offload_set(struct mlxsw_sp_fib_entry *fib_entry) nh = mlxsw_sp_rt6_nexthop(nh_grp, mlxsw_sp_rt6); if (nh && nh->offloaded) - mlxsw_sp_rt6->rt->rt6i_nh_flags |= RTNH_F_OFFLOAD; + mlxsw_sp_rt6->rt->fib6_nh.nh_flags |= RTNH_F_OFFLOAD; else - mlxsw_sp_rt6->rt->rt6i_nh_flags &= ~RTNH_F_OFFLOAD; + mlxsw_sp_rt6->rt->fib6_nh.nh_flags &= ~RTNH_F_OFFLOAD; } } @@ -3922,7 +3922,7 @@ mlxsw_sp_fib6_entry_offload_unset(struct mlxsw_sp_fib_entry *fib_entry) list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) { struct rt6_info *rt = mlxsw_sp_rt6->rt; - rt->rt6i_nh_flags &= ~RTNH_F_OFFLOAD; + rt->fib6_nh.nh_flags &= ~RTNH_F_OFFLOAD; } } @@ -4818,8 +4818,8 @@ static bool mlxsw_sp_nexthop6_ipip_type(const struct mlxsw_sp *mlxsw_sp, const struct rt6_info *rt, enum mlxsw_sp_ipip_type *ret) { - return rt->dst.dev && - mlxsw_sp_netdev_ipip_type(mlxsw_sp, rt->dst.dev, ret); + return rt->fib6_nh.nh_dev && + mlxsw_sp_netdev_ipip_type(mlxsw_sp, rt->fib6_nh.nh_dev, ret); } static int mlxsw_sp_nexthop6_type_init(struct mlxsw_sp *mlxsw_sp, @@ -4829,7 +4829,7 @@ static int mlxsw_sp_nexthop6_type_init(struct mlxsw_sp *mlxsw_sp, { const struct mlxsw_sp_ipip_ops *ipip_ops; struct mlxsw_sp_ipip_entry *ipip_entry; - struct net_device *dev = rt->dst.dev; + struct net_device *dev = rt->fib6_nh.nh_dev; struct mlxsw_sp_rif *rif; int err; @@ -4872,11 +4872,11 @@ static int mlxsw_sp_nexthop6_init(struct mlxsw_sp *mlxsw_sp, 
struct mlxsw_sp_nexthop *nh, const struct rt6_info *rt) { - struct net_device *dev = rt->dst.dev; + struct net_device *dev = rt->fib6_nh.nh_dev; nh->nh_grp = nh_grp; - nh->nh_weight = rt->rt6i_nh_weight; - memcpy(&nh->gw_addr, &rt->rt6i_gateway, sizeof(nh->gw_addr)); + nh->nh_weight = rt->fib6_nh.nh_weight; + memcpy(&nh->gw_addr, &rt->fib6_nh.nh_gw, sizeof(nh->gw_addr)); mlxsw_sp_nexthop_counter_alloc(mlxsw_sp, nh); list_add_tail(&nh->router_list_node, &mlxsw_sp->router->nexthop_list); diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 0165820bbafb..f0a88370ba95 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -127,6 +127,16 @@ struct rt6_exception { #define FIB6_EXCEPTION_BUCKET_SIZE (1 << FIB6_EXCEPTION_BUCKET_SIZE_SHIFT) #define FIB6_MAX_DEPTH 5 +struct fib6_nh { + struct in6_addr nh_gw; + struct net_device *nh_dev; + struct lwtunnel_state *nh_lwtstate; + + unsigned int nh_flags; + atomic_t nh_upper_bound; + int nh_weight; +}; + struct rt6_info { struct dst_entry dst; struct rt6_info __rcu *rt6_next; @@ -149,12 +159,9 @@ struct rt6_info { */ struct list_head rt6i_siblings; unsigned int rt6i_nsiblings; - atomic_t rt6i_nh_upper_bound; atomic_t rt6i_ref; - unsigned int rt6i_nh_flags; - /* These are in a separate cache line. */ struct rt6key rt6i_dst ____cacheline_aligned_in_smp; u32 rt6i_flags; @@ -171,13 +178,14 @@ struct rt6_info { u32 rt6i_metric; u32 rt6i_pmtu; /* more non-fragment space at head required */ - int rt6i_nh_weight; unsigned short rt6i_nfheader_len; u8 rt6i_protocol; u8 fib6_type; u8 exception_bucket_flushed:1, should_flush:1, unused:6; + + struct fib6_nh fib6_nh; }; #define for_each_fib6_node_rt_rcu(fn) \ diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 1130a1144dfd..655e13017a45 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -273,10 +273,10 @@ static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt, static inline bool rt6_duplicate_nexthop(struct rt6_info *a, struct rt6_info *b) { - return a->dst.dev == b->dst.dev && + return a->fib6_nh.nh_dev == b->fib6_nh.nh_dev && a->rt6i_idev == b->rt6i_idev && - ipv6_addr_equal(&a->rt6i_gateway, &b->rt6i_gateway) && - !lwtunnel_cmp_encap(a->dst.lwtstate, b->dst.lwtstate); + ipv6_addr_equal(&a->fib6_nh.nh_gw, &b->fib6_nh.nh_gw) && + !lwtunnel_cmp_encap(a->fib6_nh.nh_lwtstate, b->fib6_nh.nh_lwtstate); } #endif diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f71fcf2635d5..483c8772e856 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2369,7 +2369,7 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, goto out; for_each_fib6_node_rt_rcu(fn) { - if (rt->dst.dev->ifindex != dev->ifindex) + if (rt->fib6_nh.nh_dev->ifindex != dev->ifindex) continue; if ((rt->rt6i_flags & flags) != flags) continue; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 74d2a3748e2f..64b73e65f114 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -2221,6 +2221,7 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v) { struct rt6_info *rt = v; struct ipv6_route_iter *iter = seq->private; + const struct net_device *dev; seq_printf(seq, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen); @@ -2230,14 +2231,15 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "00000000000000000000000000000000 00 "); #endif if (rt->rt6i_flags & RTF_GATEWAY) - seq_printf(seq, "%pi6", &rt->rt6i_gateway); + seq_printf(seq, "%pi6", &rt->fib6_nh.nh_gw); else seq_puts(seq, 
"00000000000000000000000000000000"); + dev = rt->fib6_nh.nh_dev; seq_printf(seq, " %08x %08x %08x %08x %8s\n", rt->rt6i_metric, atomic_read(&rt->dst.__refcnt), rt->dst.__use, rt->rt6i_flags, - rt->dst.dev ? rt->dst.dev->name : ""); + dev ? dev->name : ""); iter->w.leaf = NULL; return 0; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 1cc01f5bb773..222af19d3403 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -466,12 +466,15 @@ static struct rt6_info *rt6_multipath_select(const struct net *net, if (!fl6->mp_hash) fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL); - if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound)) + if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound)) return match; list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings, rt6i_siblings) { - if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound)) + int nh_upper_bound; + + nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound); + if (fl6->mp_hash > nh_upper_bound) continue; if (rt6_score_route(sibling, oif, strict) < 0) break; @@ -495,13 +498,14 @@ static inline struct rt6_info *rt6_device_match(struct net *net, struct rt6_info *local = NULL; struct rt6_info *sprt; - if (!oif && ipv6_addr_any(saddr) && !(rt->rt6i_nh_flags & RTNH_F_DEAD)) + if (!oif && ipv6_addr_any(saddr) && + !(rt->fib6_nh.nh_flags & RTNH_F_DEAD)) return rt; for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) { - struct net_device *dev = sprt->dst.dev; + const struct net_device *dev = sprt->fib6_nh.nh_dev; - if (sprt->rt6i_nh_flags & RTNH_F_DEAD) + if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD) continue; if (oif) { @@ -533,7 +537,7 @@ static inline struct rt6_info *rt6_device_match(struct net *net, return net->ipv6.ip6_null_entry; } - return rt->rt6i_nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt; + return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? 
net->ipv6.ip6_null_entry : rt; } #ifdef CONFIG_IPV6_ROUTER_PREF @@ -558,7 +562,10 @@ static void rt6_probe_deferred(struct work_struct *w) static void rt6_probe(struct rt6_info *rt) { struct __rt6_probe_work *work; + const struct in6_addr *nh_gw; struct neighbour *neigh; + struct net_device *dev; + /* * Okay, this does not seem to be appropriate * for now, however, we need to check if it @@ -569,8 +576,11 @@ static void rt6_probe(struct rt6_info *rt) */ if (!rt || !(rt->rt6i_flags & RTF_GATEWAY)) return; + + nh_gw = &rt->fib6_nh.nh_gw; + dev = rt->fib6_nh.nh_dev; rcu_read_lock_bh(); - neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); + neigh = __ipv6_neigh_lookup_noref(dev, nh_gw); if (neigh) { if (neigh->nud_state & NUD_VALID) goto out; @@ -592,9 +602,9 @@ static void rt6_probe(struct rt6_info *rt) if (work) { INIT_WORK(&work->work, rt6_probe_deferred); - work->target = rt->rt6i_gateway; - dev_hold(rt->dst.dev); - work->dev = rt->dst.dev; + work->target = *nh_gw; + dev_hold(dev); + work->dev = dev; schedule_work(&work->work); } @@ -612,7 +622,8 @@ static inline void rt6_probe(struct rt6_info *rt) */ static inline int rt6_check_dev(struct rt6_info *rt, int oif) { - struct net_device *dev = rt->dst.dev; + const struct net_device *dev = rt->fib6_nh.nh_dev; + if (!oif || dev->ifindex == oif) return 2; if ((dev->flags & IFF_LOOPBACK) && @@ -623,15 +634,16 @@ static inline int rt6_check_dev(struct rt6_info *rt, int oif) static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt) { - struct neighbour *neigh; enum rt6_nud_state ret = RT6_NUD_FAIL_HARD; + struct neighbour *neigh; if (rt->rt6i_flags & RTF_NONEXTHOP || !(rt->rt6i_flags & RTF_GATEWAY)) return RT6_NUD_SUCCEED; rcu_read_lock_bh(); - neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); + neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev, + &rt->fib6_nh.nh_gw); if (neigh) { read_lock(&neigh->lock); if (neigh->nud_state & NUD_VALID) @@ -679,11 +691,11 @@ static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict, bool match_do_rr = false; struct inet6_dev *idev = rt->rt6i_idev; - if (rt->rt6i_nh_flags & RTNH_F_DEAD) + if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) goto out; if (idev->cnf.ignore_routes_with_linkdown && - rt->rt6i_nh_flags & RTNH_F_LINKDOWN && + rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE)) goto out; @@ -888,7 +900,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, /* called with rcu_lock held */ static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt) { - struct net_device *dev = rt->dst.dev; + struct net_device *dev = rt->fib6_nh.nh_dev; if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) { /* for copies of local routes, dst->dev needs to be the @@ -928,7 +940,7 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort) if (rt->rt6i_idev) in6_dev_hold(rt->rt6i_idev); rt->dst.lastuse = jiffies; - rt->rt6i_gateway = ort->rt6i_gateway; + rt->rt6i_gateway = ort->fib6_nh.nh_gw; rt->rt6i_flags = ort->rt6i_flags; rt6_set_from(rt, ort); rt->rt6i_metric = ort->rt6i_metric; @@ -937,7 +949,7 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort) #endif rt->rt6i_prefsrc = ort->rt6i_prefsrc; rt->rt6i_table = ort->rt6i_table; - rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate); + rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate); } static struct fib6_node* fib6_backtrack(struct fib6_node *fn, @@ -1308,7 +1320,7 @@ __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket, 
static int rt6_insert_exception(struct rt6_info *nrt, struct rt6_info *ort) { - struct net *net = dev_net(ort->dst.dev); + struct net *net = dev_net(nrt->dst.dev); struct rt6_exception_bucket *bucket; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; @@ -2313,7 +2325,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: for_each_fib6_node_rt_rcu(fn) { - if (rt->rt6i_nh_flags & RTNH_F_DEAD) + if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) continue; if (rt6_check_expired(rt)) continue; @@ -2321,14 +2333,14 @@ restart: break; if (!(rt->rt6i_flags & RTF_GATEWAY)) continue; - if (fl6->flowi6_oif != rt->dst.dev->ifindex) + if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex) continue; /* rt_cache's gateway might be different from its 'parent' * in the case of an ip redirect. * So we keep searching in the exception table if the gateway * is different. */ - if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) { + if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) { rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr); @@ -2905,7 +2917,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, &lwtstate, extack); if (err) goto out; - rt->dst.lwtstate = lwtstate_get(lwtstate); + rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate); lwtunnel_set_redirect(&rt->dst); } @@ -2920,7 +2932,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, #endif rt->rt6i_metric = cfg->fc_metric; - rt->rt6i_nh_weight = 1; + rt->fib6_nh.nh_weight = 1; rt->fib6_type = cfg->fc_type; @@ -2975,7 +2987,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, if (err) goto out; - rt->rt6i_gateway = cfg->fc_gateway; + rt->fib6_nh.nh_gw = rt->rt6i_gateway = cfg->fc_gateway; } err = -ENODEV; @@ -3010,9 +3022,9 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, install_route: if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) && !netif_carrier_ok(dev)) - rt->rt6i_nh_flags |= RTNH_F_LINKDOWN; - rt->rt6i_nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK); - rt->dst.dev = dev; + rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; + rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK); + rt->fib6_nh.nh_dev = rt->dst.dev = dev; rt->rt6i_idev = idev; rt->rt6i_table = table; @@ -3171,11 +3183,11 @@ static int ip6_route_del(struct fib6_config *cfg, rt = rt_cache; } if (cfg->fc_ifindex && - (!rt->dst.dev || - rt->dst.dev->ifindex != cfg->fc_ifindex)) + (!rt->fib6_nh.nh_dev || + rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex)) continue; if (cfg->fc_flags & RTF_GATEWAY && - !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) + !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw)) continue; if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric) continue; @@ -3337,11 +3349,11 @@ static struct rt6_info *rt6_get_route_info(struct net *net, goto out; for_each_fib6_node_rt_rcu(fn) { - if (rt->dst.dev->ifindex != ifindex) + if (rt->fib6_nh.nh_dev->ifindex != ifindex) continue; if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) continue; - if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr)) + if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr)) continue; ip6_hold_safe(NULL, &rt, false); break; @@ -3398,9 +3410,9 @@ struct rt6_info *rt6_get_dflt_router(struct net *net, rcu_read_lock(); for_each_fib6_node_rt_rcu(&table->tb6_root) { - if (dev == rt->dst.dev && + if (dev == rt->fib6_nh.nh_dev && ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == 
(RTF_ADDRCONF | RTF_DEFAULT)) && - ipv6_addr_equal(&rt->rt6i_gateway, addr)) + ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr)) break; } if (rt) @@ -3627,6 +3639,8 @@ struct rt6_info *addrconf_dst_alloc(struct net *net, rt->rt6i_flags |= RTF_LOCAL; } + rt->fib6_nh.nh_gw = *addr; + rt->fib6_nh.nh_dev = dev; rt->rt6i_gateway = *addr; rt->rt6i_dst.addr = *addr; rt->rt6i_dst.plen = 128; @@ -3649,7 +3663,7 @@ static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg) struct net *net = ((struct arg_dev_net_ip *)arg)->net; struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; - if (((void *)rt->dst.dev == dev || !dev) && + if (((void *)rt->fib6_nh.nh_dev == dev || !dev) && rt != net->ipv6.ip6_null_entry && ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) { spin_lock_bh(&rt6_exception_lock); @@ -3681,7 +3695,7 @@ static int fib6_clean_tohost(struct rt6_info *rt, void *arg) struct in6_addr *gateway = (struct in6_addr *)arg; if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && - ipv6_addr_equal(gateway, &rt->rt6i_gateway)) { + ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) { return -1; } @@ -3729,8 +3743,8 @@ static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt) static bool rt6_is_dead(const struct rt6_info *rt) { - if (rt->rt6i_nh_flags & RTNH_F_DEAD || - (rt->rt6i_nh_flags & RTNH_F_LINKDOWN && + if (rt->fib6_nh.nh_flags & RTNH_F_DEAD || + (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && rt->rt6i_idev->cnf.ignore_routes_with_linkdown)) return true; @@ -3743,11 +3757,11 @@ static int rt6_multipath_total_weight(const struct rt6_info *rt) int total = 0; if (!rt6_is_dead(rt)) - total += rt->rt6i_nh_weight; + total += rt->fib6_nh.nh_weight; list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) { if (!rt6_is_dead(iter)) - total += iter->rt6i_nh_weight; + total += iter->fib6_nh.nh_weight; } return total; @@ -3758,11 +3772,11 @@ static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total) int upper_bound = -1; if (!rt6_is_dead(rt)) { - *weight += rt->rt6i_nh_weight; + *weight += rt->fib6_nh.nh_weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, total) - 1; } - atomic_set(&rt->rt6i_nh_upper_bound, upper_bound); + atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound); } static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total) @@ -3805,8 +3819,8 @@ static int fib6_ifup(struct rt6_info *rt, void *p_arg) const struct arg_netdev_event *arg = p_arg; struct net *net = dev_net(arg->dev); - if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) { - rt->rt6i_nh_flags &= ~arg->nh_flags; + if (rt != net->ipv6.ip6_null_entry && rt->fib6_nh.nh_dev == arg->dev) { + rt->fib6_nh.nh_flags &= ~arg->nh_flags; fib6_update_sernum_upto_root(net, rt); rt6_multipath_rebalance(rt); } @@ -3834,10 +3848,10 @@ static bool rt6_multipath_uses_dev(const struct rt6_info *rt, { struct rt6_info *iter; - if (rt->dst.dev == dev) + if (rt->fib6_nh.nh_dev == dev) return true; list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) - if (iter->dst.dev == dev) + if (iter->fib6_nh.nh_dev == dev) return true; return false; @@ -3858,11 +3872,12 @@ static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt, struct rt6_info *iter; unsigned int dead = 0; - if (rt->dst.dev == down_dev || rt->rt6i_nh_flags & RTNH_F_DEAD) + if (rt->fib6_nh.nh_dev == down_dev || + rt->fib6_nh.nh_flags & RTNH_F_DEAD) dead++; list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) - if (iter->dst.dev == down_dev || - iter->rt6i_nh_flags & RTNH_F_DEAD) + if 
(iter->fib6_nh.nh_dev == down_dev || + iter->fib6_nh.nh_flags & RTNH_F_DEAD) dead++; return dead; @@ -3874,11 +3889,11 @@ static void rt6_multipath_nh_flags_set(struct rt6_info *rt, { struct rt6_info *iter; - if (rt->dst.dev == dev) - rt->rt6i_nh_flags |= nh_flags; + if (rt->fib6_nh.nh_dev == dev) + rt->fib6_nh.nh_flags |= nh_flags; list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) - if (iter->dst.dev == dev) - iter->rt6i_nh_flags |= nh_flags; + if (iter->fib6_nh.nh_dev == dev) + iter->fib6_nh.nh_flags |= nh_flags; } /* called with write lock held for table with rt */ @@ -3893,12 +3908,12 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg) switch (arg->event) { case NETDEV_UNREGISTER: - return rt->dst.dev == dev ? -1 : 0; + return rt->fib6_nh.nh_dev == dev ? -1 : 0; case NETDEV_DOWN: if (rt->should_flush) return -1; if (!rt->rt6i_nsiblings) - return rt->dst.dev == dev ? -1 : 0; + return rt->fib6_nh.nh_dev == dev ? -1 : 0; if (rt6_multipath_uses_dev(rt, dev)) { unsigned int count; @@ -3914,10 +3929,10 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg) } return -2; case NETDEV_CHANGE: - if (rt->dst.dev != dev || + if (rt->fib6_nh.nh_dev != dev || rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) break; - rt->rt6i_nh_flags |= RTNH_F_LINKDOWN; + rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; rt6_multipath_rebalance(rt); break; } @@ -3969,7 +3984,7 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) Since RFC 1981 doesn't include administrative MTU increase update PMTU increase is a MUST. (i.e. jumbo frame) */ - if (rt->dst.dev == arg->dev && + if (rt->fib6_nh.nh_dev == arg->dev && !dst_metric_locked(&rt->dst, RTAX_MTU)) { spin_lock_bh(&rt6_exception_lock); if (dst_metric_raw(&rt->dst, RTAX_MTU) && @@ -4255,7 +4270,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, goto cleanup; } - rt->rt6i_nh_weight = rtnh->rtnh_hops + 1; + rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1; err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); if (err) { @@ -4412,7 +4427,7 @@ static size_t rt6_nlmsg_size(struct rt6_info *rt) nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ + NLA_ALIGN(sizeof(struct rtnexthop)) + nla_total_size(16) /* RTA_GATEWAY */ - + lwtunnel_get_encap_size(rt->dst.lwtstate); + + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate); nexthop_len *= rt->rt6i_nsiblings; } @@ -4430,38 +4445,38 @@ static size_t rt6_nlmsg_size(struct rt6_info *rt) + nla_total_size(sizeof(struct rta_cacheinfo)) + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ + nla_total_size(1) /* RTA_PREF */ - + lwtunnel_get_encap_size(rt->dst.lwtstate) + + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate) + nexthop_len; } static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt, unsigned int *flags, bool skip_oif) { - if (rt->rt6i_nh_flags & RTNH_F_DEAD) + if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) *flags |= RTNH_F_DEAD; - if (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) { + if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) { *flags |= RTNH_F_LINKDOWN; if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown) *flags |= RTNH_F_DEAD; } if (rt->rt6i_flags & RTF_GATEWAY) { - if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0) + if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0) goto nla_put_failure; } - *flags |= (rt->rt6i_nh_flags & RTNH_F_ONLINK); - if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD) + *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK); + if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD) *flags |= RTNH_F_OFFLOAD; /* not needed for multipath encoding b/c it has a rtnexthop 
struct */ - if (!skip_oif && rt->dst.dev && - nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) + if (!skip_oif && rt->fib6_nh.nh_dev && + nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex)) goto nla_put_failure; - if (rt->dst.lwtstate && - lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0) + if (rt->fib6_nh.nh_lwtstate && + lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0) goto nla_put_failure; return 0; @@ -4473,6 +4488,7 @@ nla_put_failure: /* add multipath next hop */ static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt) { + const struct net_device *dev = rt->fib6_nh.nh_dev; struct rtnexthop *rtnh; unsigned int flags = 0; @@ -4480,8 +4496,8 @@ static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt) if (!rtnh) goto nla_put_failure; - rtnh->rtnh_hops = rt->rt6i_nh_weight - 1; - rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0; + rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1; + rtnh->rtnh_ifindex = dev ? dev->ifindex : 0; if (rt6_nexthop_info(skb, rt, &flags, true) < 0) goto nla_put_failure; -- cgit v1.2.3 From d4ead6b34b67fd711639324b6465a050bcb197d4 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:16 -0700 Subject: net/ipv6: move metrics from dst to rt6_info Similar to IPv4, add fib metrics to the fib struct, which at the moment is rt6_info. Will be moved to fib6_info in a later patch. Copy metrics into dst by reference using refcount. To make the transition: - add dst_metrics to rt6_info. Default to dst_default_metrics if no metrics are passed during route add. No need for a separate pmtu entry; it can reference the MTU slot in fib6_metrics - ip6_convert_metrics allocates memory in the FIB entry and uses ip_metrics_convert to copy from netlink attribute to metrics entry - the convert metrics call is done in ip6_route_info_create simplifying the route add path + fib6_commit_metrics and fib6_copy_metrics and the temporary mx6_config are no longer needed - add fib6_metric_set helper to change the value of a metric in the fib entry since dst_metric_set can no longer be used - cow_metrics for IPv6 can drop to dst_cow_metrics_generic - rt6_dst_from_metrics_check is no longer needed - rt6_fill_node needs the FIB entry and dst as separate arguments to keep compatibility with existing output. Current dst address is renamed to dest. (to be consistent with IPv4 rt6_fill_node really should be split into 2 functions similar to fib_dump_info and rt_fill_info) - rt6_fill_node no longer needs the temporary metrics variable Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_fib.h | 17 ++-- net/core/dst.c | 1 + net/ipv6/ip6_fib.c | 66 +++++-------- net/ipv6/ndisc.c | 10 +- net/ipv6/route.c | 257 +++++++++++++++++++------------------------------- 5 files changed, 133 insertions(+), 218 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index f0a88370ba95..1f8dc9d12abb 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -94,11 +94,6 @@ struct fib6_gc_args { #define FIB6_SUBTREE(fn) (rcu_dereference_protected((fn)->subtree, 1)) #endif -struct mx6_config { - const u32 *mx; - DECLARE_BITMAP(mx_valid, RTAX_MAX); -}; - /* * routing information * @@ -176,7 +171,6 @@ struct rt6_info { struct rt6_exception_bucket __rcu *rt6i_exception_bucket; u32 rt6i_metric; - u32 rt6i_pmtu; /* more non-fragment space at head required */ unsigned short rt6i_nfheader_len; u8 rt6i_protocol; @@ -185,6 +179,8 @@ struct rt6_info { should_flush:1, unused:6; + struct dst_metrics *fib6_metrics; +#define fib6_pmtu fib6_metrics->metrics[RTAX_MTU-1] struct fib6_nh fib6_nh; }; @@ -390,8 +386,7 @@ void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg), void *arg); int fib6_add(struct fib6_node *root, struct rt6_info *rt, - struct nl_info *info, struct mx6_config *mxc, - struct netlink_ext_ack *extack); + struct nl_info *info, struct netlink_ext_ack *extack); int fib6_del(struct rt6_info *rt, struct nl_info *info); void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info, @@ -420,6 +415,12 @@ int fib6_tables_dump(struct net *net, struct notifier_block *nb); void fib6_update_sernum(struct net *net, struct rt6_info *rt); void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt); +void fib6_metric_set(struct rt6_info *f6i, int metric, u32 val); +static inline bool fib6_metric_locked(struct rt6_info *f6i, int metric) +{ + return !!(f6i->fib6_metrics->metrics[RTAX_LOCK - 1] & (1 << metric)); +} + #ifdef CONFIG_IPV6_MULTIPLE_TABLES int fib6_rules_init(void); void fib6_rules_cleanup(void); diff --git a/net/core/dst.c b/net/core/dst.c index 007aa0b08291..2d9b37f8944a 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -58,6 +58,7 @@ const struct dst_metrics dst_default_metrics = { */ .refcnt = REFCOUNT_INIT(1), }; +EXPORT_SYMBOL(dst_default_metrics); void dst_init(struct dst_entry *dst, struct dst_ops *ops, struct net_device *dev, int initial_ref, int initial_obsolete, diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 64b73e65f114..0d94c56c3e41 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -578,6 +578,24 @@ out: return res; } +void fib6_metric_set(struct rt6_info *f6i, int metric, u32 val) +{ + if (!f6i) + return; + + if (f6i->fib6_metrics == &dst_default_metrics) { + struct dst_metrics *p = kzalloc(sizeof(*p), GFP_ATOMIC); + + if (!p) + return; + + refcount_set(&p->refcnt, 1); + f6i->fib6_metrics = p; + } + + f6i->fib6_metrics->metrics[metric - 1] = val; +} + /* * Routing Table * @@ -801,38 +819,6 @@ insert_above: return ln; } -static void fib6_copy_metrics(u32 *mp, const struct mx6_config *mxc) -{ - int i; - - for (i = 0; i < RTAX_MAX; i++) { - if (test_bit(i, mxc->mx_valid)) - mp[i] = mxc->mx[i]; - } -} - -static int fib6_commit_metrics(struct dst_entry *dst, struct mx6_config *mxc) -{ - if (!mxc->mx) - return 0; - - if (dst->flags & DST_HOST) { - u32 *mp = dst_metrics_write_ptr(dst); - - if (unlikely(!mp)) - return -ENOMEM; - - fib6_copy_metrics(mp, mxc); - } else { - dst_init_metrics(dst, mxc->mx, false); - - /* We've stolen mx now. 
*/ - mxc->mx = NULL; - } - - return 0; -} - static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, struct net *net) { @@ -866,7 +852,7 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, */ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, - struct nl_info *info, struct mx6_config *mxc, + struct nl_info *info, struct netlink_ext_ack *extack) { struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, @@ -923,7 +909,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, rt6_clean_expires(iter); else rt6_set_expires(iter, rt->dst.expires); - iter->rt6i_pmtu = rt->rt6i_pmtu; + fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu); return -EEXIST; } /* If we have the same destination and the same metric, @@ -1002,9 +988,6 @@ next_iter: add: nlflags |= NLM_F_CREATE; - err = fib6_commit_metrics(&rt->dst, mxc); - if (err) - return err; err = call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_ADD, @@ -1035,10 +1018,6 @@ add: return -ENOENT; } - err = fib6_commit_metrics(&rt->dst, mxc); - if (err) - return err; - err = call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE, rt, extack); @@ -1135,8 +1114,7 @@ void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt) */ int fib6_add(struct fib6_node *root, struct rt6_info *rt, - struct nl_info *info, struct mx6_config *mxc, - struct netlink_ext_ack *extack) + struct nl_info *info, struct netlink_ext_ack *extack) { struct fib6_table *table = rt->rt6i_table; struct fib6_node *fn, *pn = NULL; @@ -1244,7 +1222,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, } #endif - err = fib6_add_rt2node(fn, rt, info, mxc, extack); + err = fib6_add_rt2node(fn, rt, info, extack); if (!err) { __fib6_update_sernum_upto_root(rt, sernum); fib6_start_gc(info->nl_net, rt); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 3fbc3805e69b..b058ea9ecec0 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1323,9 +1323,8 @@ static void ndisc_router_discovery(struct sk_buff *skb) ra_msg->icmph.icmp6_hop_limit) { if (in6_dev->cnf.accept_ra_min_hop_limit <= ra_msg->icmph.icmp6_hop_limit) { in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit; - if (rt) - dst_metric_set(&rt->dst, RTAX_HOPLIMIT, - ra_msg->icmph.icmp6_hop_limit); + fib6_metric_set(rt, RTAX_HOPLIMIT, + ra_msg->icmph.icmp6_hop_limit); } else { ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than minimum\n"); } @@ -1477,10 +1476,7 @@ skip_routeinfo: ND_PRINTK(2, warn, "RA: invalid mtu: %d\n", mtu); } else if (in6_dev->cnf.mtu6 != mtu) { in6_dev->cnf.mtu6 = mtu; - - if (rt) - dst_metric_set(&rt->dst, RTAX_MTU, mtu); - + fib6_metric_set(rt, RTAX_MTU, mtu); rt6_mtu_change(skb->dev, mtu); } } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 3b301aafd2ed..62aafd06c35f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -96,12 +96,11 @@ static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu); static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); -static void rt6_dst_from_metrics_check(struct rt6_info *rt); static int rt6_score_route(struct rt6_info *rt, int oif, int strict); static size_t rt6_nlmsg_size(struct rt6_info *rt); -static int rt6_fill_node(struct net *net, - struct sk_buff *skb, struct rt6_info *rt, - struct in6_addr *dst, struct in6_addr *src, +static int rt6_fill_node(struct net *net, struct sk_buff *skb, + struct rt6_info *rt, struct dst_entry *dst, + struct in6_addr *dest, struct 
in6_addr *src, int iif, int type, u32 portid, u32 seq, unsigned int flags); static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt, @@ -183,23 +182,6 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) } } -static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt) -{ - return dst_metrics_write_ptr(&rt->from->dst); -} - -static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old) -{ - struct rt6_info *rt = (struct rt6_info *)dst; - - if (rt->rt6i_flags & RTF_PCPU) - return rt6_pcpu_cow_metrics(rt); - else if (rt->rt6i_flags & RTF_CACHE) - return NULL; - else - return dst_cow_metrics_generic(dst, old); -} - static inline const void *choose_neigh_daddr(struct rt6_info *rt, struct sk_buff *skb, const void *daddr) @@ -249,7 +231,7 @@ static struct dst_ops ip6_dst_ops_template = { .check = ip6_dst_check, .default_advmss = ip6_default_advmss, .mtu = ip6_mtu, - .cow_metrics = ipv6_cow_metrics, + .cow_metrics = dst_cow_metrics_generic, .destroy = ip6_dst_destroy, .ifdown = ip6_dst_ifdown, .negative_advice = ip6_negative_advice, @@ -353,6 +335,7 @@ static void rt6_info_init(struct rt6_info *rt) memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); INIT_LIST_HEAD(&rt->rt6i_siblings); INIT_LIST_HEAD(&rt->rt6i_uncached); + rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; } /* allocate dst with ip6_dst_ops */ @@ -395,6 +378,7 @@ static void ip6_dst_destroy(struct dst_entry *dst) struct rt6_exception_bucket *bucket; struct rt6_info *from = rt->from; struct inet6_dev *idev; + struct dst_metrics *m; dst_destroy_metrics_generic(dst); free_percpu(rt->rt6i_pcpu); @@ -411,6 +395,10 @@ static void ip6_dst_destroy(struct dst_entry *dst) kfree(bucket); } + m = rt->fib6_metrics; + if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt)) + kfree(m); + rt->from = NULL; dst_release(&from->dst); } @@ -996,7 +984,11 @@ static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from) rt->rt6i_flags &= ~RTF_EXPIRES; dst_hold(&from->dst); rt->from = from; - dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true); + dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true); + if (from->fib6_metrics != &dst_default_metrics) { + rt->dst._metrics |= DST_METRICS_REFCOUNTED; + refcount_inc(&from->fib6_metrics->refcnt); + } } static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort) @@ -1140,7 +1132,6 @@ EXPORT_SYMBOL(rt6_lookup); */ static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info, - struct mx6_config *mxc, struct netlink_ext_ack *extack) { int err; @@ -1148,7 +1139,7 @@ static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info, table = rt->rt6i_table; spin_lock_bh(&table->tb6_lock); - err = fib6_add(&table->tb6_root, rt, info, mxc, extack); + err = fib6_add(&table->tb6_root, rt, info, extack); spin_unlock_bh(&table->tb6_lock); return err; @@ -1157,11 +1148,10 @@ static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info, int ip6_ins_rt(struct net *net, struct rt6_info *rt) { struct nl_info info = { .nl_net = net, }; - struct mx6_config mxc = { .mx = NULL, }; /* Hold dst to account for the reference from the fib6 tree */ dst_hold(&rt->dst); - return __ip6_ins_rt(rt, &info, &mxc, NULL); + return __ip6_ins_rt(rt, &info, NULL); } static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, @@ -1232,8 +1222,8 @@ static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt) p = this_cpu_ptr(rt->rt6i_pcpu); pcpu_rt = *p; - if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false)) - 
rt6_dst_from_metrics_check(pcpu_rt); + if (pcpu_rt) + ip6_hold_safe(NULL, &pcpu_rt, false); return pcpu_rt; } @@ -1254,7 +1244,6 @@ static struct rt6_info *rt6_make_pcpu_route(struct net *net, prev = cmpxchg(p, NULL, pcpu_rt); BUG_ON(prev); - rt6_dst_from_metrics_check(pcpu_rt); return pcpu_rt; } @@ -1384,6 +1373,16 @@ __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket, return NULL; } +static unsigned int fib6_mtu(const struct rt6_info *rt) +{ + unsigned int mtu; + + mtu = rt->fib6_pmtu ? : rt->rt6i_idev->cnf.mtu6; + mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); + + return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu); +} + static int rt6_insert_exception(struct rt6_info *nrt, struct rt6_info *ort) { @@ -1436,7 +1435,7 @@ static int rt6_insert_exception(struct rt6_info *nrt, * Only insert this exception route if its mtu * is less than ort's mtu value. */ - if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) { + if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) { err = -EINVAL; goto out; } @@ -1673,12 +1672,12 @@ static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, struct rt6_info *entry = rt6_ex->rt6i; /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected - * route), the metrics of its rt->dst.from have already + * route), the metrics of its rt->from have already * been updated. */ - if (entry->rt6i_pmtu && + if (dst_metric_raw(&entry->dst, RTAX_MTU) && rt6_mtu_change_route_allowed(idev, entry, mtu)) - entry->rt6i_pmtu = mtu; + dst_metric_set(&entry->dst, RTAX_MTU, mtu); } bucket++; } @@ -1844,10 +1843,9 @@ redo_rt6_select: trace_fib6_table_lookup(net, rt, table, fl6); return rt; } else if (rt->rt6i_flags & RTF_CACHE) { - if (ip6_hold_safe(net, &rt, true)) { + if (ip6_hold_safe(net, &rt, true)) dst_use_noref(&rt->dst, jiffies); - rt6_dst_from_metrics_check(rt); - } + rcu_read_unlock(); trace_fib6_table_lookup(net, rt, table, fl6); return rt; @@ -2147,13 +2145,6 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori * Destination cache support functions */ -static void rt6_dst_from_metrics_check(struct rt6_info *rt) -{ - if (rt->from && - dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst)) - dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true); -} - static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) { u32 rt_cookie = 0; @@ -2188,8 +2179,6 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) * into this function always. 
*/ - rt6_dst_from_metrics_check(rt); - if (rt->rt6i_flags & RTF_PCPU || (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from)) return rt6_dst_from_check(rt, cookie); @@ -2242,8 +2231,8 @@ static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) { struct net *net = dev_net(rt->dst.dev); + dst_metric_set(&rt->dst, RTAX_MTU, mtu); rt->rt6i_flags |= RTF_MODIFIED; - rt->rt6i_pmtu = mtu; rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); } @@ -2289,10 +2278,10 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, } else if (daddr) { struct rt6_info *nrt6; - nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr); + nrt6 = ip6_rt_cache_alloc(rt6->from, daddr, saddr); if (nrt6) { rt6_do_update_pmtu(nrt6, mtu); - if (rt6_insert_exception(nrt6, rt6)) + if (rt6_insert_exception(nrt6, rt6->from)) dst_release_immediate(&nrt6->dst); } } @@ -2533,12 +2522,8 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst) static unsigned int ip6_mtu(const struct dst_entry *dst) { - const struct rt6_info *rt = (const struct rt6_info *)dst; - unsigned int mtu = rt->rt6i_pmtu; struct inet6_dev *idev; - - if (mtu) - goto out; + unsigned int mtu; mtu = dst_metric_raw(dst, RTAX_MTU); if (mtu) @@ -2622,60 +2607,24 @@ out: return entries > rt_max_size; } -static int ip6_convert_metrics(struct mx6_config *mxc, - const struct fib6_config *cfg) +static int ip6_convert_metrics(struct net *net, struct rt6_info *rt, + struct fib6_config *cfg) { - struct net *net = cfg->fc_nlinfo.nl_net; - bool ecn_ca = false; - struct nlattr *nla; - int remaining; - u32 *mp; - - if (!cfg->fc_mx) - return 0; - - mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); - if (unlikely(!mp)) - return -ENOMEM; - - nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { - int type = nla_type(nla); - u32 val; - - if (!type) - continue; - if (unlikely(type > RTAX_MAX)) - goto err; - - if (type == RTAX_CC_ALGO) { - char tmp[TCP_CA_NAME_MAX]; + int err = 0; - nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca); - if (val == TCP_CA_UNSPEC) - goto err; - } else { - val = nla_get_u32(nla); - } - if (type == RTAX_HOPLIMIT && val > 255) - val = 255; - if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) - goto err; + if (cfg->fc_mx) { + rt->fib6_metrics = kzalloc(sizeof(*rt->fib6_metrics), + GFP_KERNEL); + if (unlikely(!rt->fib6_metrics)) + return -ENOMEM; - mp[type - 1] = val; - __set_bit(type - 1, mxc->mx_valid); - } + refcount_set(&rt->fib6_metrics->refcnt, 1); - if (ecn_ca) { - __set_bit(RTAX_FEATURES - 1, mxc->mx_valid); - mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; + err = ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, + rt->fib6_metrics->metrics); } - mxc->mx = mp; - return 0; - err: - kfree(mp); - return -EINVAL; + return err; } static struct rt6_info *ip6_nh_lookup_table(struct net *net, @@ -2955,6 +2904,10 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, goto out; } + err = ip6_convert_metrics(net, rt, cfg); + if (err < 0) + goto out; + if (cfg->fc_flags & RTF_EXPIRES) rt6_set_expires(rt, jiffies + clock_t_to_jiffies(cfg->fc_expires)); @@ -3078,32 +3031,16 @@ out: return ERR_PTR(err); } -int ip6_route_add(struct fib6_config *cfg, - struct netlink_ext_ack *extack) +int ip6_route_add(struct fib6_config *cfg, struct netlink_ext_ack *extack) { - struct mx6_config mxc = { .mx = NULL, }; struct rt6_info *rt; int err; rt = ip6_route_info_create(cfg, extack); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); - rt = NULL; - goto out; - } - - err = 
ip6_convert_metrics(&mxc, cfg); - if (err) - goto out; - - err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack); - - kfree(mxc.mx); + if (IS_ERR(rt)) + return PTR_ERR(rt); - return err; -out: - if (rt) - dst_release_immediate(&rt->dst); + err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); return err; } @@ -3157,7 +3094,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) if (skb) { u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; - if (rt6_fill_node(net, skb, rt, + if (rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, RTM_DELROUTE, info->portid, seq, 0) < 0) { kfree_skb(skb); @@ -3348,7 +3285,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu * a cached route because rt6_insert_exception() will * takes care of it */ - if (rt6_insert_exception(nrt, rt)) { + if (rt6_insert_exception(nrt, rt->from)) { dst_release_immediate(&nrt->dst); goto out; } @@ -4018,11 +3955,14 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) update PMTU increase is a MUST. (i.e. jumbo frame) */ if (rt->fib6_nh.nh_dev == arg->dev && - !dst_metric_locked(&rt->dst, RTAX_MTU)) { + !fib6_metric_locked(rt, RTAX_MTU)) { + u32 mtu = rt->fib6_pmtu; + + if (mtu >= arg->mtu || + (mtu < arg->mtu && mtu == idev->cnf.mtu6)) + fib6_metric_set(rt, RTAX_MTU, arg->mtu); + spin_lock_bh(&rt6_exception_lock); - if (dst_metric_raw(&rt->dst, RTAX_MTU) && - rt6_mtu_change_route_allowed(idev, rt, arg->mtu)) - dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu); rt6_exceptions_update_pmtu(idev, rt, arg->mtu); spin_unlock_bh(&rt6_exception_lock); } @@ -4183,7 +4123,6 @@ errout: struct rt6_nh { struct rt6_info *rt6_info; struct fib6_config r_cfg; - struct mx6_config mxc; struct list_head next; }; @@ -4198,7 +4137,8 @@ static void ip6_print_replace_route_err(struct list_head *rt6_nh_list) } } -static int ip6_route_info_append(struct list_head *rt6_nh_list, +static int ip6_route_info_append(struct net *net, + struct list_head *rt6_nh_list, struct rt6_info *rt, struct fib6_config *r_cfg) { struct rt6_nh *nh; @@ -4214,7 +4154,7 @@ static int ip6_route_info_append(struct list_head *rt6_nh_list, if (!nh) return -ENOMEM; nh->rt6_info = rt; - err = ip6_convert_metrics(&nh->mxc, r_cfg); + err = ip6_convert_metrics(net, rt, r_cfg); if (err) { kfree(nh); return err; @@ -4305,7 +4245,8 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1; - err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); + err = ip6_route_info_append(info->nl_net, &rt6_nh_list, + rt, &r_cfg); if (err) { dst_release_immediate(&rt->dst); goto cleanup; @@ -4323,7 +4264,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, err_nh = NULL; list_for_each_entry(nh, &rt6_nh_list, next) { rt_last = nh->rt6_info; - err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack); + err = __ip6_ins_rt(nh->rt6_info, info, extack); /* save reference to first route for notification */ if (!rt_notif && !err) rt_notif = nh->rt6_info; @@ -4372,7 +4313,6 @@ cleanup: list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { if (nh->rt6_info) dst_release_immediate(&nh->rt6_info->dst); - kfree(nh->mxc.mx); list_del(&nh->next); kfree(nh); } @@ -4546,16 +4486,16 @@ nla_put_failure: return -EMSGSIZE; } -static int rt6_fill_node(struct net *net, - struct sk_buff *skb, struct rt6_info *rt, - struct in6_addr *dst, struct in6_addr *src, +static int rt6_fill_node(struct net *net, struct sk_buff *skb, + struct rt6_info *rt, struct dst_entry *dst, + struct in6_addr *dest, struct 
in6_addr *src, int iif, int type, u32 portid, u32 seq, unsigned int flags) { - u32 metrics[RTAX_MAX]; struct rtmsg *rtm; struct nlmsghdr *nlh; - long expires; + long expires = 0; + u32 *pmetrics; u32 table; nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); @@ -4583,8 +4523,8 @@ static int rt6_fill_node(struct net *net, if (rt->rt6i_flags & RTF_CACHE) rtm->rtm_flags |= RTM_F_CLONED; - if (dst) { - if (nla_put_in6_addr(skb, RTA_DST, dst)) + if (dest) { + if (nla_put_in6_addr(skb, RTA_DST, dest)) goto nla_put_failure; rtm->rtm_dst_len = 128; } else if (rtm->rtm_dst_len) @@ -4612,9 +4552,9 @@ static int rt6_fill_node(struct net *net, #endif if (nla_put_u32(skb, RTA_IIF, iif)) goto nla_put_failure; - } else if (dst) { + } else if (dest) { struct in6_addr saddr_buf; - if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 && + if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 && nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) goto nla_put_failure; } @@ -4626,10 +4566,8 @@ static int rt6_fill_node(struct net *net, goto nla_put_failure; } - memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); - if (rt->rt6i_pmtu) - metrics[RTAX_MTU - 1] = rt->rt6i_pmtu; - if (rtnetlink_put_metrics(skb, metrics) < 0) + pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics; + if (rtnetlink_put_metrics(skb, pmetrics) < 0) goto nla_put_failure; if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric)) @@ -4661,9 +4599,10 @@ static int rt6_fill_node(struct net *net, goto nla_put_failure; } - expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0; + if (rt->rt6i_flags & RTF_EXPIRES && dst) + expires = dst->expires - jiffies; - if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0) + if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? 
dst->error : 0) < 0) goto nla_put_failure; if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags))) @@ -4697,10 +4636,9 @@ int rt6_dump_route(struct rt6_info *rt, void *p_arg) } } - return rt6_fill_node(net, - arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE, - NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq, - NLM_F_MULTI); + return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0, + RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid, + arg->cb->nlh->nlmsg_seq, NLM_F_MULTI); } static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, @@ -4814,13 +4752,14 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, skb_dst_set(skb, &rt->dst); if (fibmatch) - err = rt6_fill_node(net, skb, rt, NULL, NULL, iif, + err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, iif, RTM_NEWROUTE, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0); else - err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif, - RTM_NEWROUTE, NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, 0); + err = rt6_fill_node(net, skb, rt, dst, &fl6.daddr, &fl6.saddr, + iif, RTM_NEWROUTE, + NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, + 0); if (err < 0) { kfree_skb(skb); goto errout; @@ -4846,8 +4785,8 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info, if (!skb) goto errout; - err = rt6_fill_node(net, skb, rt, NULL, NULL, 0, - event, info->portid, seq, nlm_flags); + err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, + event, info->portid, seq, nlm_flags); if (err < 0) { /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); -- cgit v1.2.3 From 14895687d36805f051bb54014c32e48e5937f7e1 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:17 -0700 Subject: net/ipv6: move expires into rt6_info Add expires to rt6_info for FIB entries, and add fib6 helpers to manage it. Data path use of dst.expires remains. The transition is fairly straightforward: when working with fib entries, rt->dst.expires is just rt->expires, rt6_clean_expires is replaced with fib6_clean_expires, rt6_set_expires becomes fib6_set_expires, and rt6_check_expired becomes fib6_check_expired, where the fib6 versions are added by this patch. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_fib.h | 27 +++++++++++++++++++++++---- net/ipv6/addrconf.c | 6 +++--- net/ipv6/ip6_fib.c | 8 ++++---- net/ipv6/ndisc.c | 2 +- net/ipv6/route.c | 20 +++++++++++--------- 5 files changed, 42 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 1f8dc9d12abb..c73b985734f5 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -179,6 +179,7 @@ struct rt6_info { should_flush:1, unused:6; + unsigned long expires; struct dst_metrics *fib6_metrics; #define fib6_pmtu fib6_metrics->metrics[RTAX_MTU-1] struct fib6_nh fib6_nh; @@ -197,6 +198,26 @@ static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) return ((struct rt6_info *)dst)->rt6i_idev; } +static inline void fib6_clean_expires(struct rt6_info *f6i) +{ + f6i->rt6i_flags &= ~RTF_EXPIRES; + f6i->expires = 0; +} + +static inline void fib6_set_expires(struct rt6_info *f6i, + unsigned long expires) +{ + f6i->expires = expires; + f6i->rt6i_flags |= RTF_EXPIRES; +} + +static inline bool fib6_check_expired(const struct rt6_info *f6i) +{ + if (f6i->rt6i_flags & RTF_EXPIRES) + return time_after(jiffies, f6i->expires); + return false; +} + static inline void rt6_clean_expires(struct rt6_info *rt) { rt->rt6i_flags &= ~RTF_EXPIRES; @@ -211,11 +232,9 @@ static inline void rt6_set_expires(struct rt6_info *rt, unsigned long expires) static inline void rt6_update_expires(struct rt6_info *rt0, int timeout) { - struct rt6_info *rt; + if (!(rt0->rt6i_flags & RTF_EXPIRES) && rt0->from) + rt0->dst.expires = rt0->from->expires; - for (rt = rt0; rt && !(rt->rt6i_flags & RTF_EXPIRES); rt = rt->from); - if (rt && rt != rt0) - rt0->dst.expires = rt->dst.expires; dst_set_expires(&rt0->dst, timeout); rt0->rt6i_flags |= RTF_EXPIRES; } diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 483c8772e856..a156f1a0b1a7 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1190,7 +1190,7 @@ cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_r ip6_del_rt(dev_net(ifp->idev->dev), rt); else { if (!(rt->rt6i_flags & RTF_EXPIRES)) - rt6_set_expires(rt, expires); + fib6_set_expires(rt, expires); ip6_rt_put(rt); } } @@ -2672,9 +2672,9 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) rt = NULL; } else if (addrconf_finite_timeout(rt_expires)) { /* not infinity */ - rt6_set_expires(rt, jiffies + rt_expires); + fib6_set_expires(rt, jiffies + rt_expires); } else { - rt6_clean_expires(rt); + fib6_clean_expires(rt); } } else if (valid_lft) { clock_t expires = 0; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 0d94c56c3e41..f25f4d9831e8 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -906,9 +906,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, if (!(iter->rt6i_flags & RTF_EXPIRES)) return -EEXIST; if (!(rt->rt6i_flags & RTF_EXPIRES)) - rt6_clean_expires(iter); + fib6_clean_expires(iter); else - rt6_set_expires(iter, rt->dst.expires); + fib6_set_expires(iter, rt->expires); fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu); return -EEXIST; } @@ -2003,8 +2003,8 @@ static int fib6_age(struct rt6_info *rt, void *arg) * Routes are expired even if they are in use. 
 */
-	if (rt->rt6i_flags & RTF_EXPIRES && rt->dst.expires) {
-		if (time_after(now, rt->dst.expires)) {
+	if (rt->rt6i_flags & RTF_EXPIRES && rt->expires) {
+		if (time_after(now, rt->expires)) {
 			RT6_TRACE("expiring %p\n", rt);
 			return -1;
 		}
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index b058ea9ecec0..e4d9eea92139 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1318,7 +1318,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 	}
 
 	if (rt)
-		rt6_set_expires(rt, jiffies + (HZ * lifetime));
+		fib6_set_expires(rt, jiffies + (HZ * lifetime));
 	if (in6_dev->cnf.accept_ra_min_hop_limit < 256 &&
 	    ra_msg->icmph.icmp6_hop_limit) {
 		if (in6_dev->cnf.accept_ra_min_hop_limit <= ra_msg->icmph.icmp6_hop_limit) {
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 62aafd06c35f..f6ca55d21fac 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -435,7 +435,7 @@ static bool rt6_check_expired(const struct rt6_info *rt)
 			return true;
 	} else if (rt->from) {
 		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
-			rt6_check_expired(rt->from);
+			fib6_check_expired(rt->from);
 	}
 	return false;
 }
@@ -687,7 +687,7 @@ static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
 	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
 		goto out;
 
-	if (rt6_check_expired(rt))
+	if (fib6_check_expired(rt))
 		goto out;
 
 	m = rt6_score_route(rt, oif, strict);
@@ -871,9 +871,9 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
 
 	if (rt) {
 		if (!addrconf_finite_timeout(lifetime))
-			rt6_clean_expires(rt);
+			fib6_clean_expires(rt);
 		else
-			rt6_set_expires(rt, jiffies + HZ * lifetime);
+			fib6_set_expires(rt, jiffies + HZ * lifetime);
 
 		ip6_rt_put(rt);
 	}
@@ -2383,7 +2383,7 @@ restart:
 	for_each_fib6_node_rt_rcu(fn) {
 		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
 			continue;
-		if (rt6_check_expired(rt))
+		if (fib6_check_expired(rt))
 			continue;
 		if (rt->rt6i_flags & RTF_REJECT)
 			break;
@@ -2909,10 +2909,10 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 		goto out;
 
 	if (cfg->fc_flags & RTF_EXPIRES)
-		rt6_set_expires(rt, jiffies +
+		fib6_set_expires(rt, jiffies +
 				clock_t_to_jiffies(cfg->fc_expires));
 	else
-		rt6_clean_expires(rt);
+		fib6_clean_expires(rt);
 
 	if (cfg->fc_protocol == RTPROT_UNSPEC)
 		cfg->fc_protocol = RTPROT_BOOT;
@@ -4599,8 +4599,10 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
 			goto nla_put_failure;
 	}
 
-	if (rt->rt6i_flags & RTF_EXPIRES && dst)
-		expires = dst->expires - jiffies;
+	if (rt->rt6i_flags & RTF_EXPIRES) {
+		expires = dst ? dst->expires : rt->expires;
+		expires -= jiffies;
+	}
 
 	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
 		goto nla_put_failure;
-- cgit v1.2.3


From 421842edeaf62c4e180b687f5a4efca8c19c49ad Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Tue, 17 Apr 2018 17:33:18 -0700
Subject: net/ipv6: Add fib6_null_entry

ip6_null_entry will stay a dst-based return for lookups that fail to
match an entry. Add a new fib6_null_entry which constitutes the root
node and leaves for FIBs. Replace existing references to ip6_null_entry
with the new fib6_null_entry when dealing with FIBs.

Signed-off-by: David Ahern
Signed-off-by: David S.
Miller --- include/net/netns/ipv6.h | 3 ++- net/ipv6/ip6_fib.c | 26 ++++++++++---------- net/ipv6/route.c | 62 +++++++++++++++++++++++++++++++++--------------- 3 files changed, 58 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index c29f09cfc9d7..74e4e1e449d5 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -60,7 +60,8 @@ struct netns_ipv6 { #endif struct xt_table *ip6table_nat; #endif - struct rt6_info *ip6_null_entry; + struct rt6_info *fib6_null_entry; + struct rt6_info *ip6_null_entry; struct rt6_statistics *rt6_stats; struct timer_list ip6_fib_timer; struct hlist_head *fib_table_hash; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index f25f4d9831e8..280b69497ad0 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -231,7 +231,7 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) if (table) { table->tb6_id = id; rcu_assign_pointer(table->tb6_root.leaf, - net->ipv6.ip6_null_entry); + net->ipv6.fib6_null_entry); table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&table->tb6_peers); } @@ -369,7 +369,7 @@ struct fib6_dump_arg { static void fib6_rt_dump(struct rt6_info *rt, struct fib6_dump_arg *arg) { - if (rt == arg->net->ipv6.ip6_null_entry) + if (rt == arg->net->ipv6.fib6_null_entry) return; call_fib6_entry_notifier(arg->nb, arg->net, FIB_EVENT_ENTRY_ADD, rt); } @@ -658,7 +658,7 @@ static struct fib6_node *fib6_add_1(struct net *net, /* remove null_entry in the root node */ } else if (fn->fn_flags & RTN_TL_ROOT && rcu_access_pointer(fn->leaf) == - net->ipv6.ip6_null_entry) { + net->ipv6.fib6_null_entry) { RCU_INIT_POINTER(fn->leaf, NULL); } @@ -1171,9 +1171,9 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (!sfn) goto failure; - atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref); + atomic_inc(&info->nl_net->ipv6.fib6_null_entry->rt6i_ref); rcu_assign_pointer(sfn->leaf, - info->nl_net->ipv6.ip6_null_entry); + info->nl_net->ipv6.fib6_null_entry); sfn->fn_flags = RTN_ROOT; /* Now add the first leaf node to new subtree */ @@ -1212,7 +1212,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (fn->fn_flags & RTN_TL_ROOT) { /* put back null_entry for root node */ rcu_assign_pointer(fn->leaf, - info->nl_net->ipv6.ip6_null_entry); + info->nl_net->ipv6.fib6_null_entry); } else { atomic_inc(&rt->rt6i_ref); rcu_assign_pointer(fn->leaf, rt); @@ -1251,7 +1251,7 @@ out: if (!pn_leaf) { WARN_ON(!pn_leaf); pn_leaf = - info->nl_net->ipv6.ip6_null_entry; + info->nl_net->ipv6.fib6_null_entry; } #endif atomic_inc(&pn_leaf->rt6i_ref); @@ -1494,7 +1494,7 @@ static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *child_left, *child_right; if (fn->fn_flags & RTN_ROOT) - return net->ipv6.ip6_null_entry; + return net->ipv6.fib6_null_entry; while (fn) { child_left = rcu_dereference_protected(fn->left, @@ -1531,7 +1531,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net, /* Set fn->leaf to null_entry for root node. 
*/ if (fn->fn_flags & RTN_TL_ROOT) { - rcu_assign_pointer(fn->leaf, net->ipv6.ip6_null_entry); + rcu_assign_pointer(fn->leaf, net->ipv6.fib6_null_entry); return fn; } @@ -1576,7 +1576,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net, #if RT6_DEBUG >= 2 if (!new_fn_leaf) { WARN_ON(!new_fn_leaf); - new_fn_leaf = net->ipv6.ip6_null_entry; + new_fn_leaf = net->ipv6.fib6_null_entry; } #endif atomic_inc(&new_fn_leaf->rt6i_ref); @@ -1726,7 +1726,7 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info) return -ENOENT; } #endif - if (!fn || rt == net->ipv6.ip6_null_entry) + if (!fn || rt == net->ipv6.fib6_null_entry) return -ENOENT; WARN_ON(!(fn->fn_flags & RTN_RTINFO)); @@ -2087,7 +2087,7 @@ static int __net_init fib6_net_init(struct net *net) net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN; rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf, - net->ipv6.ip6_null_entry); + net->ipv6.fib6_null_entry); net->ipv6.fib6_main_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers); @@ -2099,7 +2099,7 @@ static int __net_init fib6_net_init(struct net *net) goto out_fib6_main_tbl; net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL; rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf, - net->ipv6.ip6_null_entry); + net->ipv6.fib6_null_entry); net->ipv6.fib6_local_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index f6ca55d21fac..7c141394d4f1 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -276,6 +276,15 @@ static const u32 ip6_template_metrics[RTAX_MAX] = { [RTAX_HOPLIMIT - 1] = 0, }; +static const struct rt6_info fib6_null_entry_template = { + .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), + .rt6i_protocol = RTPROT_KERNEL, + .rt6i_metric = ~(u32)0, + .rt6i_ref = ATOMIC_INIT(1), + .fib6_type = RTN_UNREACHABLE, + .fib6_metrics = (struct dst_metrics *)&dst_default_metrics, +}; + static const struct rt6_info ip6_null_entry_template = { .dst = { .__refcnt = ATOMIC_INIT(1), @@ -522,10 +531,10 @@ static inline struct rt6_info *rt6_device_match(struct net *net, return local; if (flags & RT6_LOOKUP_F_IFACE) - return net->ipv6.ip6_null_entry; + return net->ipv6.fib6_null_entry; } - return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt; + return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt; } #ifdef CONFIG_IPV6_ROUTER_PREF @@ -758,8 +767,8 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, bool do_rr = false; int key_plen; - if (!leaf || leaf == net->ipv6.ip6_null_entry) - return net->ipv6.ip6_null_entry; + if (!leaf || leaf == net->ipv6.fib6_null_entry) + return net->ipv6.fib6_null_entry; rt0 = rcu_dereference(fn->rr_ptr); if (!rt0) @@ -776,7 +785,7 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, key_plen = rt0->rt6i_src.plen; #endif if (fn->fn_bit != key_plen) - return net->ipv6.ip6_null_entry; + return net->ipv6.fib6_null_entry; match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict, &do_rr); @@ -797,7 +806,7 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, } } - return match ? match : net->ipv6.ip6_null_entry; + return match ? 
match : net->ipv6.fib6_null_entry; } static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt) @@ -1063,7 +1072,7 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net, restart: rt = rcu_dereference(fn->leaf); if (!rt) { - rt = net->ipv6.ip6_null_entry; + rt = net->ipv6.fib6_null_entry; } else { rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags); @@ -1071,7 +1080,7 @@ restart: rt = rt6_multipath_select(net, rt, fl6, fl6->flowi6_oif, skb, flags); } - if (rt == net->ipv6.ip6_null_entry) { + if (rt == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto restart; @@ -1820,7 +1829,7 @@ redo_rt6_select: rt = rt6_select(net, fn, oif, strict); if (rt->rt6i_nsiblings) rt = rt6_multipath_select(net, rt, fl6, oif, skb, strict); - if (rt == net->ipv6.ip6_null_entry) { + if (rt == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto redo_rt6_select; @@ -1837,7 +1846,8 @@ redo_rt6_select: if (rt_cache) rt = rt_cache; - if (rt == net->ipv6.ip6_null_entry) { + if (rt == net->ipv6.fib6_null_entry) { + rt = net->ipv6.ip6_null_entry; rcu_read_unlock(); dst_hold(&rt->dst); trace_fib6_table_lookup(net, rt, table, fl6); @@ -2412,13 +2422,13 @@ restart: } if (!rt) - rt = net->ipv6.ip6_null_entry; + rt = net->ipv6.fib6_null_entry; else if (rt->rt6i_flags & RTF_REJECT) { rt = net->ipv6.ip6_null_entry; goto out; } - if (rt == net->ipv6.ip6_null_entry) { + if (rt == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto restart; @@ -3051,7 +3061,7 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) struct fib6_table *table; int err; - if (rt == net->ipv6.ip6_null_entry) { + if (rt == net->ipv6.fib6_null_entry) { err = -ENOENT; goto out; } @@ -3081,7 +3091,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) struct fib6_table *table; int err = -ENOENT; - if (rt == net->ipv6.ip6_null_entry) + if (rt == net->ipv6.fib6_null_entry) goto out_put; table = rt->rt6i_table; spin_lock_bh(&table->tb6_lock); @@ -3634,7 +3644,7 @@ static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg) struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; if (((void *)rt->fib6_nh.nh_dev == dev || !dev) && - rt != net->ipv6.ip6_null_entry && + rt != net->ipv6.fib6_null_entry && ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) { spin_lock_bh(&rt6_exception_lock); /* remove prefsrc entry */ @@ -3789,7 +3799,7 @@ static int fib6_ifup(struct rt6_info *rt, void *p_arg) const struct arg_netdev_event *arg = p_arg; struct net *net = dev_net(arg->dev); - if (rt != net->ipv6.ip6_null_entry && rt->fib6_nh.nh_dev == arg->dev) { + if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) { rt->fib6_nh.nh_flags &= ~arg->nh_flags; fib6_update_sernum_upto_root(net, rt); rt6_multipath_rebalance(rt); @@ -3873,7 +3883,7 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg) const struct net_device *dev = arg->dev; struct net *net = dev_net(dev); - if (rt == net->ipv6.ip6_null_entry) + if (rt == net->ipv6.fib6_null_entry) return 0; switch (arg->event) { @@ -4624,7 +4634,7 @@ int rt6_dump_route(struct rt6_info *rt, void *p_arg) struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; struct net *net = arg->net; - if (rt == net->ipv6.ip6_null_entry) + if (rt == net->ipv6.fib6_null_entry) return 0; if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) { @@ -4813,6 +4823,8 @@ static int ip6_route_dev_notify(struct notifier_block *this, return NOTIFY_OK; if (event == 
NETDEV_REGISTER) { + net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev; + net->ipv6.fib6_null_entry->rt6i_idev = in6_dev_get(dev); net->ipv6.ip6_null_entry->dst.dev = dev; net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES @@ -4826,6 +4838,7 @@ static int ip6_route_dev_notify(struct notifier_block *this, /* NETDEV_UNREGISTER could be fired for multiple times by * netdev_wait_allrefs(). Make sure we only call this once. */ + in6_dev_put_clear(&net->ipv6.fib6_null_entry->rt6i_idev); in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev); @@ -5009,11 +5022,17 @@ static int __net_init ip6_route_net_init(struct net *net) if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) goto out_ip6_dst_ops; + net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template, + sizeof(*net->ipv6.fib6_null_entry), + GFP_KERNEL); + if (!net->ipv6.fib6_null_entry) + goto out_ip6_dst_entries; + net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, sizeof(*net->ipv6.ip6_null_entry), GFP_KERNEL); if (!net->ipv6.ip6_null_entry) - goto out_ip6_dst_entries; + goto out_fib6_null_entry; net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_null_entry->dst, ip6_template_metrics, true); @@ -5060,6 +5079,8 @@ out_ip6_prohibit_entry: out_ip6_null_entry: kfree(net->ipv6.ip6_null_entry); #endif +out_fib6_null_entry: + kfree(net->ipv6.fib6_null_entry); out_ip6_dst_entries: dst_entries_destroy(&net->ipv6.ip6_dst_ops); out_ip6_dst_ops: @@ -5068,6 +5089,7 @@ out_ip6_dst_ops: static void __net_exit ip6_route_net_exit(struct net *net) { + kfree(net->ipv6.fib6_null_entry); kfree(net->ipv6.ip6_null_entry); #ifdef CONFIG_IPV6_MULTIPLE_TABLES kfree(net->ipv6.ip6_prohibit_entry); @@ -5138,6 +5160,8 @@ void __init ip6_route_init_special_entries(void) /* Registering of the loopback is done before this portion of code, * the loopback reference in rt6_info will not be taken, do it * manually for init_net */ + init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev; + init_net.ipv6.fib6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES -- cgit v1.2.3 From 3b6761d18bc11f2af2a6fc494e9026d39593f22c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:20 -0700 Subject: net/ipv6: Move dst flags to booleans in fib entries Continuing to wean FIB paths off of dst_entry, use a bool to hold requests for certain dst settings. Add a helper to convert the flags to DST flags when a FIB entry is converted to a dst_entry. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_fib.h | 5 ++++- net/ipv6/addrconf.c | 4 ++-- net/ipv6/route.c | 29 ++++++++++++++++++++++++----- 3 files changed, 30 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index c73b985734f5..159f651dee55 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -177,7 +177,10 @@ struct rt6_info { u8 fib6_type; u8 exception_bucket_flushed:1, should_flush:1, - unused:6; + dst_nocount:1, + dst_nopolicy:1, + dst_host:1, + unused:3; unsigned long expires; struct dst_metrics *fib6_metrics; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index a156f1a0b1a7..c8df5c6499db 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1046,7 +1046,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, if (net->ipv6.devconf_all->disable_policy || idev->cnf.disable_policy) - rt->dst.flags |= DST_NOPOLICY; + rt->dst_nopolicy = true; neigh_parms_data_state_setall(idev->nd_parms); @@ -5981,7 +5981,7 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val) int cpu; rcu_read_lock(); - addrconf_set_nopolicy(ifa->rt, val); + ifa->rt->dst_nopolicy = val ? true : false; if (rt->rt6i_pcpu) { for_each_possible_cpu(cpu) { struct rt6_info **rtp; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e293692174ba..1a3e0db31b34 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -937,6 +937,20 @@ static int ip6_rt_type_to_error(u8 fib6_type) return fib6_prop[fib6_type]; } +static unsigned short fib6_info_dst_flags(struct rt6_info *rt) +{ + unsigned short flags = 0; + + if (rt->dst_nocount) + flags |= DST_NOCOUNT; + if (rt->dst_nopolicy) + flags |= DST_NOPOLICY; + if (rt->dst_host) + flags |= DST_HOST; + + return flags; +} + static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct rt6_info *ort) { rt->dst.error = ip6_rt_type_to_error(ort->fib6_type); @@ -961,6 +975,8 @@ static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct rt6_info *ort) static void ip6_rt_init_dst(struct rt6_info *rt, struct rt6_info *ort) { + rt->dst.flags |= fib6_info_dst_flags(ort); + if (ort->rt6i_flags & RTF_REJECT) { ip6_rt_init_dst_reject(rt, ort); return; @@ -970,7 +986,6 @@ static void ip6_rt_init_dst(struct rt6_info *rt, struct rt6_info *ort) rt->dst.output = ip6_output; if (ort->fib6_type == RTN_LOCAL) { - rt->dst.flags |= DST_HOST; rt->dst.input = ip6_input; } else if (ipv6_addr_type(&ort->rt6i_dst.addr) & IPV6_ADDR_MULTICAST) { rt->dst.input = ip6_mc_input; @@ -1058,10 +1073,11 @@ static bool ip6_hold_safe(struct net *net, struct rt6_info **prt, /* called with rcu_lock held */ static struct rt6_info *ip6_create_rt_rcu(struct rt6_info *rt) { + unsigned short flags = fib6_info_dst_flags(rt); struct net_device *dev = rt->fib6_nh.nh_dev; struct rt6_info *nrt; - nrt = __ip6_dst_alloc(dev_net(dev), dev, 0); + nrt = __ip6_dst_alloc(dev_net(dev), dev, flags); if (nrt) ip6_rt_copy_init(nrt, rt); @@ -1229,12 +1245,13 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt) { + unsigned short flags = fib6_info_dst_flags(rt); struct net_device *dev; struct rt6_info *pcpu_rt; rcu_read_lock(); dev = ip6_rt_get_dev_rcu(rt); - pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags); + pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, flags); rcu_read_unlock(); if (!pcpu_rt) return NULL; @@ -2965,7 +2982,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, 
cfg->fc_dst_len); rt->rt6i_dst.plen = cfg->fc_dst_len; if (rt->rt6i_dst.plen == 128) - rt->dst.flags |= DST_HOST; + rt->dst_host = true; #ifdef CONFIG_IPV6_SUBTREES ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len); @@ -3626,10 +3643,12 @@ struct rt6_info *addrconf_dst_alloc(struct net *net, if (!rt) return ERR_PTR(-ENOMEM); + rt->dst_nocount = true; + in6_dev_hold(idev); rt->rt6i_idev = idev; - rt->dst.flags |= DST_HOST; + rt->dst_host = true; rt->rt6i_protocol = RTPROT_KERNEL; rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP; if (anycast) { -- cgit v1.2.3 From f8a1b43b709d8ef33a8de2f8f35095b4a4413713 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:21 -0700 Subject: net/ipv6: Create a neigh_lookup for FIB entries The router discovery code has a FIB entry and wants to validate the gateway has a neighbor entry. Refactor the existing dst_neigh_lookup for IPv6 and create a new function that takes the gateway and device and returns a neighbor entry. Use the new function in ndisc_router_discovery to validate the gateway. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_route.h | 3 +++ net/ipv6/ndisc.c | 8 ++++++-- net/ipv6/route.c | 33 ++++++++++++++++++++------------- 3 files changed, 29 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 655e13017a45..cb6fb7e16a28 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -279,4 +279,7 @@ static inline bool rt6_duplicate_nexthop(struct rt6_info *a, struct rt6_info *b) !lwtunnel_cmp_encap(a->fib6_nh.nh_lwtstate, b->fib6_nh.nh_lwtstate); } +struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, + struct net_device *dev, struct sk_buff *skb, + const void *daddr); #endif diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index e4d9eea92139..556717154fa3 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1276,7 +1276,9 @@ static void ndisc_router_discovery(struct sk_buff *skb) rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev); if (rt) { - neigh = dst_neigh_lookup(&rt->dst, &ipv6_hdr(skb)->saddr); + neigh = ip6_neigh_lookup(&rt->fib6_nh.nh_gw, + rt->fib6_nh.nh_dev, NULL, + &ipv6_hdr(skb)->saddr); if (!neigh) { ND_PRINTK(0, err, "RA: %s got default router without neighbour\n", @@ -1304,7 +1306,9 @@ static void ndisc_router_discovery(struct sk_buff *skb) return; } - neigh = dst_neigh_lookup(&rt->dst, &ipv6_hdr(skb)->saddr); + neigh = ip6_neigh_lookup(&rt->fib6_nh.nh_gw, + rt->fib6_nh.nh_dev, NULL, + &ipv6_hdr(skb)->saddr); if (!neigh) { ND_PRINTK(0, err, "RA: %s got default router without neighbour\n", diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 1a3e0db31b34..d635d71f7d51 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -182,12 +182,10 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) } } -static inline const void *choose_neigh_daddr(struct rt6_info *rt, +static inline const void *choose_neigh_daddr(const struct in6_addr *p, struct sk_buff *skb, const void *daddr) { - struct in6_addr *p = &rt->rt6i_gateway; - if (!ipv6_addr_any(p)) return (const void *) p; else if (skb) @@ -195,18 +193,27 @@ static inline const void *choose_neigh_daddr(struct rt6_info *rt, return daddr; } -static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, - struct sk_buff *skb, - const void *daddr) +struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, + struct net_device *dev, + struct sk_buff *skb, + const void *daddr) { - struct 
rt6_info *rt = (struct rt6_info *) dst; struct neighbour *n; - daddr = choose_neigh_daddr(rt, skb, daddr); - n = __ipv6_neigh_lookup(dst->dev, daddr); + daddr = choose_neigh_daddr(gw, skb, daddr); + n = __ipv6_neigh_lookup(dev, daddr); if (n) return n; - return neigh_create(&nd_tbl, daddr, dst->dev); + return neigh_create(&nd_tbl, daddr, dev); +} + +static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst, + struct sk_buff *skb, + const void *daddr) +{ + const struct rt6_info *rt = container_of(dst, struct rt6_info, dst); + + return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr); } static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr) @@ -214,7 +221,7 @@ static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr) struct net_device *dev = dst->dev; struct rt6_info *rt = (struct rt6_info *)dst; - daddr = choose_neigh_daddr(rt, NULL, daddr); + daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr); if (!daddr) return; if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) @@ -239,7 +246,7 @@ static struct dst_ops ip6_dst_ops_template = { .update_pmtu = ip6_rt_update_pmtu, .redirect = rt6_do_redirect, .local_out = __ip6_local_out, - .neigh_lookup = ip6_neigh_lookup, + .neigh_lookup = ip6_dst_neigh_lookup, .confirm_neigh = ip6_confirm_neigh, }; @@ -269,7 +276,7 @@ static struct dst_ops ip6_dst_blackhole_ops = { .update_pmtu = ip6_rt_blackhole_update_pmtu, .redirect = ip6_rt_blackhole_redirect, .cow_metrics = dst_cow_metrics_generic, - .neigh_lookup = ip6_neigh_lookup, + .neigh_lookup = ip6_dst_neigh_lookup, }; static const u32 ip6_template_metrics[RTAX_MAX] = { -- cgit v1.2.3 From acb54e3cba404c20f07733f3222c0418a7724a5b Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:22 -0700 Subject: net/ipv6: Add gfp_flags to route add functions Most FIB entries can be added using memory allocated with GFP_KERNEL. Add gfp_flags to ip6_route_add and addrconf_dst_alloc. Code paths that can be reached from the packet path (e.g., ndisc and autoconfig) or atomic notifiers use GFP_ATOMIC; paths from user context (adding addresses and routes) use GFP_KERNEL. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_route.h | 6 ++++-- net/ipv6/addrconf.c | 39 +++++++++++++++++++++++---------------- net/ipv6/anycast.c | 2 +- net/ipv6/route.c | 18 ++++++++++-------- 4 files changed, 38 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index cb6fb7e16a28..ff70266e30d7 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -100,7 +100,8 @@ void ip6_route_cleanup(void); int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg); -int ip6_route_add(struct fib6_config *cfg, struct netlink_ext_ack *extack); +int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, + struct netlink_ext_ack *extack); int ip6_ins_rt(struct net *net, struct rt6_info *rt); int ip6_del_rt(struct net *net, struct rt6_info *rt); @@ -138,7 +139,8 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6); void fib6_force_start_gc(struct net *net); struct rt6_info *addrconf_dst_alloc(struct net *net, struct inet6_dev *idev, - const struct in6_addr *addr, bool anycast); + const struct in6_addr *addr, bool anycast, + gfp_t gfp_flags); struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, int flags); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index c8df5c6499db..915cd0734b27 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1037,7 +1037,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, goto out; } - rt = addrconf_dst_alloc(net, idev, addr, false); + rt = addrconf_dst_alloc(net, idev, addr, false, gfp_flags); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; @@ -2320,7 +2320,7 @@ static void ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpad static void addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev, - unsigned long expires, u32 flags) + unsigned long expires, u32 flags, gfp_t gfp_flags) { struct fib6_config cfg = { .fc_table = l3mdev_fib_table(dev) ? 
: RT6_TABLE_PREFIX, @@ -2345,7 +2345,7 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev, cfg.fc_flags |= RTF_NONEXTHOP; #endif - ip6_route_add(&cfg, NULL); + ip6_route_add(&cfg, gfp_flags, NULL); } @@ -2401,7 +2401,7 @@ static void addrconf_add_mroute(struct net_device *dev) ipv6_addr_set(&cfg.fc_dst, htonl(0xFF000000), 0, 0, 0); - ip6_route_add(&cfg, NULL); + ip6_route_add(&cfg, GFP_ATOMIC, NULL); } static struct inet6_dev *addrconf_add_dev(struct net_device *dev) @@ -2685,7 +2685,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) expires = jiffies_to_clock_t(rt_expires); } addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len, - dev, expires, flags); + dev, expires, flags, GFP_ATOMIC); } ip6_rt_put(rt); } @@ -2900,7 +2900,7 @@ static int inet6_addr_add(struct net *net, int ifindex, if (!IS_ERR(ifp)) { if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) { addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev, - expires, flags); + expires, flags, GFP_KERNEL); } /* Send a netlink notification if DAD is enabled and @@ -3053,7 +3053,8 @@ static void sit_add_v4_addrs(struct inet6_dev *idev) if (addr.s6_addr32[3]) { add_addr(idev, &addr, plen, scope); - addrconf_prefix_route(&addr, plen, idev->dev, 0, pflags); + addrconf_prefix_route(&addr, plen, idev->dev, 0, pflags, + GFP_ATOMIC); return; } @@ -3078,7 +3079,7 @@ static void sit_add_v4_addrs(struct inet6_dev *idev) add_addr(idev, &addr, plen, flag); addrconf_prefix_route(&addr, plen, idev->dev, 0, - pflags); + pflags, GFP_ATOMIC); } } } @@ -3118,7 +3119,8 @@ void addrconf_add_linklocal(struct inet6_dev *idev, ifp = ipv6_add_addr(idev, addr, NULL, 64, IFA_LINK, addr_flags, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, true, NULL); if (!IS_ERR(ifp)) { - addrconf_prefix_route(&ifp->addr, ifp->prefix_len, idev->dev, 0, 0); + addrconf_prefix_route(&ifp->addr, ifp->prefix_len, idev->dev, + 0, 0, GFP_ATOMIC); addrconf_dad_start(ifp); in6_ifa_put(ifp); } @@ -3233,7 +3235,8 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route) addrconf_add_linklocal(idev, &addr, IFA_F_STABLE_PRIVACY); else if (prefix_route) - addrconf_prefix_route(&addr, 64, idev->dev, 0, 0); + addrconf_prefix_route(&addr, 64, idev->dev, + 0, 0, GFP_KERNEL); break; case IN6_ADDR_GEN_MODE_EUI64: /* addrconf_add_linklocal also adds a prefix_route and we @@ -3243,7 +3246,8 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route) if (ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) == 0) addrconf_add_linklocal(idev, &addr, 0); else if (prefix_route) - addrconf_prefix_route(&addr, 64, idev->dev, 0, 0); + addrconf_prefix_route(&addr, 64, idev->dev, + 0, 0, GFP_ATOMIC); break; case IN6_ADDR_GEN_MODE_NONE: default: @@ -3346,7 +3350,8 @@ static int fixup_permanent_addr(struct net *net, if (!ifp->rt || !ifp->rt->rt6i_node) { struct rt6_info *rt, *prev; - rt = addrconf_dst_alloc(net, idev, &ifp->addr, false); + rt = addrconf_dst_alloc(net, idev, &ifp->addr, false, + GFP_ATOMIC); if (IS_ERR(rt)) return PTR_ERR(rt); @@ -3361,7 +3366,7 @@ static int fixup_permanent_addr(struct net *net, if (!(ifp->flags & IFA_F_NOPREFIXROUTE)) { addrconf_prefix_route(&ifp->addr, ifp->prefix_len, - idev->dev, 0, 0); + idev->dev, 0, 0, GFP_ATOMIC); } if (ifp->state == INET6_IFADDR_STATE_PREDAD) @@ -4572,8 +4577,9 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags, ipv6_ifa_notify(0, ifp); if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) { - addrconf_prefix_route(&ifp->addr, ifp->prefix_len, ifp->idev->dev, - expires, 
flags); + addrconf_prefix_route(&ifp->addr, ifp->prefix_len, + ifp->idev->dev, expires, flags, + GFP_KERNEL); } else if (had_prefixroute) { enum cleanup_prefix_rt_t action; unsigned long rt_expires; @@ -5614,7 +5620,8 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) addrconf_join_anycast(ifp); if (!ipv6_addr_any(&ifp->peer_addr)) addrconf_prefix_route(&ifp->peer_addr, 128, - ifp->idev->dev, 0, 0); + ifp->idev->dev, 0, 0, + GFP_KERNEL); break; case RTM_DELADDR: if (ifp->idev->cnf.forwarding) diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 1122fb299b86..e456386fe4d5 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -267,7 +267,7 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) } net = dev_net(idev->dev); - rt = addrconf_dst_alloc(net, idev, addr, true); + rt = addrconf_dst_alloc(net, idev, addr, true, GFP_ATOMIC); if (IS_ERR(rt)) { err = PTR_ERR(rt); goto out; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d635d71f7d51..56a854b426a6 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2866,6 +2866,7 @@ out: } static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, + gfp_t gfp_flags, struct netlink_ext_ack *extack) { struct net *net = cfg->fc_nlinfo.nl_net; @@ -3086,12 +3087,13 @@ out: return ERR_PTR(err); } -int ip6_route_add(struct fib6_config *cfg, struct netlink_ext_ack *extack) +int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, + struct netlink_ext_ack *extack) { struct rt6_info *rt; int err; - rt = ip6_route_info_create(cfg, extack); + rt = ip6_route_info_create(cfg, gfp_flags, extack); if (IS_ERR(rt)) return PTR_ERR(rt); @@ -3418,7 +3420,7 @@ static struct rt6_info *rt6_add_route_info(struct net *net, if (!prefixlen) cfg.fc_flags |= RTF_DEFAULT; - ip6_route_add(&cfg, NULL); + ip6_route_add(&cfg, GFP_ATOMIC, NULL); return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); } @@ -3469,7 +3471,7 @@ struct rt6_info *rt6_add_dflt_router(struct net *net, cfg.fc_gateway = *gwaddr; - if (!ip6_route_add(&cfg, NULL)) { + if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) { struct fib6_table *table; table = fib6_get_table(dev_net(dev), cfg.fc_table); @@ -3567,7 +3569,7 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) rtnl_lock(); switch (cmd) { case SIOCADDRT: - err = ip6_route_add(&cfg, NULL); + err = ip6_route_add(&cfg, GFP_KERNEL, NULL); break; case SIOCDELRT: err = ip6_route_del(&cfg, NULL); @@ -3640,7 +3642,7 @@ static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff struct rt6_info *addrconf_dst_alloc(struct net *net, struct inet6_dev *idev, const struct in6_addr *addr, - bool anycast) + bool anycast, gfp_t gfp_flags) { u32 tb_id; struct net_device *dev = idev->dev; @@ -4293,7 +4295,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, } r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK); - rt = ip6_route_info_create(&r_cfg, extack); + rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; @@ -4446,7 +4448,7 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (cfg.fc_mp) return ip6_route_multipath_add(&cfg, extack); else - return ip6_route_add(&cfg, extack); + return ip6_route_add(&cfg, GFP_KERNEL, extack); } static size_t rt6_nlmsg_size(struct rt6_info *rt) -- cgit v1.2.3 From 23fb93a4d3f118a900790066d03368a296dce0d6 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:23 -0700 Subject: net/ipv6: Cleanup exception and cache 
route handling

IPv6 FIB will only contain FIB entries, with exception routes attached
to the FIB entry. Once this transformation is complete, FIB lookups
will return a fib6_info while the lookup functions still return a
dst-based rt6_info. The current code uses rt6_info for both paths and
overloads the rt6_info variable usually called 'rt'. This patch
introduces a new 'f6i' variable name for the result of the FIB lookup
and keeps 'rt' as the dst-based return variable. 'f6i' becomes a
fib6_info in a later patch, which is why it is introduced as f6i now;
this avoids additional churn in the later patch.

In addition, remove the RTF_CACHE and dst checks from fib6 add and
delete, since they cannot happen now and will never happen after the
data type flip.

Signed-off-by: David Ahern
Signed-off-by: David S. Miller
---
 include/net/ip6_route.h |   1 -
 net/ipv6/ip6_fib.c      |  16 +-----
 net/ipv6/route.c        | 142 +++++++++++++++++++++++++++---------------------
 3 files changed, 81 insertions(+), 78 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index ff70266e30d7..686cdc7f356a 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -106,7 +106,6 @@ int ip6_ins_rt(struct net *net, struct rt6_info *rt);
 int ip6_del_rt(struct net *net, struct rt6_info *rt);
 
 void rt6_flush_exceptions(struct rt6_info *rt);
-int rt6_remove_exception_rt(struct rt6_info *rt);
 void rt6_age_exceptions(struct rt6_info *rt, struct fib6_gc_args *gc_args,
 			unsigned long now);
 
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 280b69497ad0..53df4a98a7f7 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1074,7 +1074,7 @@ add:
 static void fib6_start_gc(struct net *net, struct rt6_info *rt)
 {
 	if (!timer_pending(&net->ipv6.ip6_fib_timer) &&
-	    (rt->rt6i_flags & (RTF_EXPIRES | RTF_CACHE)))
+	    (rt->rt6i_flags & RTF_EXPIRES))
 		mod_timer(&net->ipv6.ip6_fib_timer,
 			  jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
 }
@@ -1125,8 +1125,6 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
 
 	if (WARN_ON_ONCE(!atomic_read(&rt->dst.__refcnt)))
 		return -EINVAL;
-	if (WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE))
-		return -EINVAL;
 
 	if (info->nlh) {
 		if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
@@ -1650,8 +1648,6 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
 
 	RT6_TRACE("fib6_del_route\n");
 
-	WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE);
-
 	/* Unlink it */
 	*rtp = rt->rt6_next;
 	rt->rt6i_node = NULL;
@@ -1720,21 +1716,11 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info)
 	struct rt6_info __rcu **rtp;
 	struct rt6_info __rcu **rtp_next;
 
-#if RT6_DEBUG >= 2
-	if (rt->dst.obsolete > 0) {
-		WARN_ON(fn);
-		return -ENOENT;
-	}
-#endif
 	if (!fn || rt == net->ipv6.fib6_null_entry)
 		return -ENOENT;
 
 	WARN_ON(!(fn->fn_flags & RTN_RTINFO));
 
-	/* remove cached dst from exception table */
-	if (rt->rt6i_flags & RTF_CACHE)
-		return rt6_remove_exception_rt(rt);
-
 	/*
 	 *	Walk the leaf entries looking for ourself
 	 */
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 56a854b426a6..ad9eaecf539c 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1013,8 +1013,8 @@ static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
 	BUG_ON(from->from);
 
 	rt->rt6i_flags &= ~RTF_EXPIRES;
-	dst_hold(&from->dst);
-	rt->from = from;
+	if (dst_hold_safe(&from->dst))
+		rt->from = from;
 	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
 	if (from->fib6_metrics != &dst_default_metrics) {
 		rt->dst._metrics |= DST_METRICS_REFCOUNTED;
@@ -1097,8 +1097,9 @@ static struct rt6_info
*ip6_pol_route_lookup(struct net *net, const struct sk_buff *skb, int flags) { - struct rt6_info *rt, *rt_cache; + struct rt6_info *f6i; struct fib6_node *fn; + struct rt6_info *rt; if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) flags &= ~RT6_LOOKUP_F_IFACE; @@ -1106,36 +1107,36 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net, rcu_read_lock(); fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: - rt = rcu_dereference(fn->leaf); - if (!rt) { - rt = net->ipv6.fib6_null_entry; + f6i = rcu_dereference(fn->leaf); + if (!f6i) { + f6i = net->ipv6.fib6_null_entry; } else { - rt = rt6_device_match(net, rt, &fl6->saddr, + f6i = rt6_device_match(net, f6i, &fl6->saddr, fl6->flowi6_oif, flags); - if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0) - rt = rt6_multipath_select(net, rt, fl6, fl6->flowi6_oif, - skb, flags); + if (f6i->rt6i_nsiblings && fl6->flowi6_oif == 0) + f6i = rt6_multipath_select(net, f6i, fl6, + fl6->flowi6_oif, skb, flags); } - if (rt == net->ipv6.fib6_null_entry) { + if (f6i == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto restart; } + /* Search through exception table */ - rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr); - if (rt_cache) { - rt = rt_cache; + rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr); + if (rt) { if (ip6_hold_safe(net, &rt, true)) dst_use_noref(&rt->dst, jiffies); - } else if (dst_hold_safe(&rt->dst)) { - struct rt6_info *nrt; - - nrt = ip6_create_rt_rcu(rt); - dst_release(&rt->dst); - rt = nrt; - } else { + } else if (f6i == net->ipv6.fib6_null_entry) { rt = net->ipv6.ip6_null_entry; dst_hold(&rt->dst); + } else { + rt = ip6_create_rt_rcu(f6i); + if (!rt) { + rt = net->ipv6.ip6_null_entry; + dst_hold(&rt->dst); + } } rcu_read_unlock(); @@ -1218,9 +1219,6 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, * Clone the route. 
*/ - if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)) - ort = ort->from; - rcu_read_lock(); dev = ip6_rt_get_dev_rcu(ort); rt = __ip6_dst_alloc(dev_net(dev), dev, 0); @@ -1446,11 +1444,6 @@ static int rt6_insert_exception(struct rt6_info *nrt, struct rt6_exception *rt6_ex; int err = 0; - /* ort can't be a cache or pcpu route */ - if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)) - ort = ort->from; - WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)); - spin_lock_bh(&rt6_exception_lock); if (ort->exception_bucket_flushed) { @@ -1589,7 +1582,7 @@ static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt, } /* Remove the passed in cached rt from the hash table that contains it */ -int rt6_remove_exception_rt(struct rt6_info *rt) +static int rt6_remove_exception_rt(struct rt6_info *rt) { struct rt6_exception_bucket *bucket; struct rt6_info *from = rt->from; @@ -1854,7 +1847,8 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, const struct sk_buff *skb, int flags) { struct fib6_node *fn, *saved_fn; - struct rt6_info *rt, *rt_cache; + struct rt6_info *f6i; + struct rt6_info *rt; int strict = 0; strict |= flags & RT6_LOOKUP_F_IFACE; @@ -1871,10 +1865,10 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, oif = 0; redo_rt6_select: - rt = rt6_select(net, fn, oif, strict); - if (rt->rt6i_nsiblings) - rt = rt6_multipath_select(net, rt, fl6, oif, skb, strict); - if (rt == net->ipv6.fib6_null_entry) { + f6i = rt6_select(net, fn, oif, strict); + if (f6i->rt6i_nsiblings) + f6i = rt6_multipath_select(net, f6i, fl6, oif, skb, strict); + if (f6i == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto redo_rt6_select; @@ -1886,18 +1880,17 @@ redo_rt6_select: } } - /*Search through exception table */ - rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr); - if (rt_cache) - rt = rt_cache; - - if (rt == net->ipv6.fib6_null_entry) { + if (f6i == net->ipv6.fib6_null_entry) { rt = net->ipv6.ip6_null_entry; rcu_read_unlock(); dst_hold(&rt->dst); trace_fib6_table_lookup(net, rt, table, fl6); return rt; - } else if (rt->rt6i_flags & RTF_CACHE) { + } + + /*Search through exception table */ + rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr); + if (rt) { if (ip6_hold_safe(net, &rt, true)) dst_use_noref(&rt->dst, jiffies); @@ -1905,7 +1898,7 @@ redo_rt6_select: trace_fib6_table_lookup(net, rt, table, fl6); return rt; } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && - !(rt->rt6i_flags & RTF_GATEWAY))) { + !(f6i->rt6i_flags & RTF_GATEWAY))) { /* Create a RTF_CACHE clone which will not be * owned by the fib6 tree. 
It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
@@ -1914,16 +1907,16 @@ redo_rt6_select:
 		struct rt6_info *uncached_rt;
 
-		if (ip6_hold_safe(net, &rt, true)) {
-			dst_use_noref(&rt->dst, jiffies);
+		if (ip6_hold_safe(net, &f6i, true)) {
+			dst_use_noref(&f6i->dst, jiffies);
 		} else {
 			rcu_read_unlock();
-			uncached_rt = rt;
+			uncached_rt = f6i;
 			goto uncached_rt_out;
 		}
 		rcu_read_unlock();
 
-		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
+		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
 		dst_release(&rt->dst);
 
 		if (uncached_rt) {
@@ -1946,18 +1939,18 @@ uncached_rt_out:
 		struct rt6_info *pcpu_rt;
 
-		dst_use_noref(&rt->dst, jiffies);
+		dst_use_noref(&f6i->dst, jiffies);
 		local_bh_disable();
-		pcpu_rt = rt6_get_pcpu_route(rt);
+		pcpu_rt = rt6_get_pcpu_route(f6i);
 
 		if (!pcpu_rt) {
 			/* atomic_inc_not_zero() is needed when using rcu */
-			if (atomic_inc_not_zero(&rt->rt6i_ref)) {
+			if (atomic_inc_not_zero(&f6i->rt6i_ref)) {
 				/* No dst_hold() on rt is needed because grabbing
 				 * rt->rt6i_ref makes sure rt can't be released.
 				 */
-				pcpu_rt = rt6_make_pcpu_route(net, rt);
-				rt6_release(rt);
+				pcpu_rt = rt6_make_pcpu_route(net, f6i);
+				rt6_release(f6i);
 			} else {
 				/* rt is already removed from tree */
 				pcpu_rt = net->ipv6.ip6_null_entry;
@@ -2419,7 +2412,8 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
 					     int flags)
 {
 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
-	struct rt6_info *rt, *rt_cache;
+	struct rt6_info *ret = NULL, *rt_cache;
+	struct rt6_info *rt;
 	struct fib6_node *fn;
 
 	/* Get the "current" route for this destination and
@@ -2458,7 +2452,7 @@ restart:
 			if (rt_cache &&
 			    ipv6_addr_equal(&rdfl->gateway,
 					    &rt_cache->rt6i_gateway)) {
-				rt = rt_cache;
+				ret = rt_cache;
 				break;
 			}
 			continue;
@@ -2469,7 +2463,7 @@ restart:
 	if (!rt)
 		rt = net->ipv6.fib6_null_entry;
 	else if (rt->rt6i_flags & RTF_REJECT) {
-		rt = net->ipv6.ip6_null_entry;
+		ret = net->ipv6.ip6_null_entry;
 		goto out;
 	}
 
@@ -2480,12 +2474,15 @@ restart:
 	}
 
 out:
-	ip6_hold_safe(net, &rt, true);
+	if (ret)
+		dst_hold(&ret->dst);
+	else
+		ret = ip6_create_rt_rcu(rt);
 
 	rcu_read_unlock();
 
-	trace_fib6_table_lookup(net, rt, table, fl6);
-	return rt;
+	trace_fib6_table_lookup(net, ret, table, fl6);
+	return ret;
 };
 
 static struct dst_entry *ip6_route_redirect(struct net *net,
@@ -3182,6 +3179,22 @@ out_put:
 	return err;
 }
 
+static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
+{
+	int rc = -ESRCH;
+
+	if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
+		goto out;
+
+	if (cfg->fc_flags & RTF_GATEWAY &&
+	    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
+		goto out;
+	if (dst_hold_safe(&rt->dst))
+		rc = rt6_remove_exception_rt(rt);
+out:
+	return rc;
+}
+
 static int ip6_route_del(struct fib6_config *cfg,
 			 struct netlink_ext_ack *extack)
 {
@@ -3206,11 +3219,16 @@ static int ip6_route_del(struct fib6_config *cfg,
 	if (fn) {
 		for_each_fib6_node_rt_rcu(fn) {
 			if (cfg->fc_flags & RTF_CACHE) {
+				int rc;
+
 				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
 							      &cfg->fc_src);
-				if (!rt_cache)
-					continue;
-				rt = rt_cache;
+				if (rt_cache) {
+					rc = ip6_del_cached_rt(rt_cache, cfg);
+					if (rc != -ESRCH)
+						return rc;
+				}
+				continue;
 			}
 			if (cfg->fc_ifindex &&
 			    (!rt->fib6_nh.nh_dev ||
@@ -3327,7 +3345,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
 		     NEIGH_UPDATE_F_ISROUTER)),
 		     NDISC_REDIRECT, &ndopts);
 
-	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
+	nrt = ip6_rt_cache_alloc(rt->from, &msg->dest, NULL);
 	if (!nrt)
 		goto out;
-- 
cgit v1.2.3
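
The lookup rework in the patch above is easier to follow outside of diff
context. After the change, ip6_pol_route_lookup() resolves a FIB entry
(f6i) first, then consults the exception table for a cached dst-based
clone, and only creates a fresh rt6_info under RCU when no cached entry
exists; the null entry is detected before any dst work happens. Below is a
minimal, self-contained userspace sketch of that control flow. Only the
names fib6_info, rt6_info, rt6_find_cached_rt and ip6_create_rt_rcu are
modeled on the patch; the types, the helper bodies and the _sketch suffix
are simplified stand-ins for illustration, not kernel code.

#include <stdio.h>
#include <stdlib.h>

/* Stand-ins for the two objects the patch separates: the FIB entry
 * (routing table data) and the dst-based clone handed to callers.
 */
struct fib6_info {
	const char *name;
	int is_null;
};

struct rt6_info {
	struct fib6_info *from;	/* clone points back at its FIB entry */
	int refcnt;
};

static struct fib6_info fib6_null_entry = { "fib6_null_entry", 1 };

/* Hypothetical: resolve a FIB entry; may return the null entry. */
static struct fib6_info *fib6_lookup_sketch(int have_route)
{
	static struct fib6_info route = { "fib entry", 0 };

	return have_route ? &route : &fib6_null_entry;
}

/* Hypothetical: search the exception (cache) table for a clone. */
static struct rt6_info *rt6_find_cached_rt_sketch(struct fib6_info *f6i)
{
	(void)f6i;
	return NULL;	/* model a cache miss */
}

/* Hypothetical: create a dst clone of the FIB entry (done under RCU
 * in the kernel).
 */
static struct rt6_info *ip6_create_rt_rcu_sketch(struct fib6_info *f6i)
{
	struct rt6_info *rt = calloc(1, sizeof(*rt));

	if (rt) {
		rt->from = f6i;
		rt->refcnt = 1;
	}
	return rt;
}

/* The reordered lookup: FIB entry, then cache, then a new clone. */
static struct rt6_info *lookup_sketch(int have_route)
{
	struct fib6_info *f6i = fib6_lookup_sketch(have_route);
	struct rt6_info *rt;

	rt = rt6_find_cached_rt_sketch(f6i);
	if (rt)
		return rt;		/* cached clone, reuse it */

	if (f6i->is_null)
		return NULL;		/* kernel returns ip6_null_entry */

	return ip6_create_rt_rcu_sketch(f6i);	/* fresh clone */
}

int main(void)
{
	struct rt6_info *rt = lookup_sketch(1);

	if (rt) {
		printf("clone of %s\n", rt->from->name);
		free(rt);
	}
	return 0;
}

The point of the ordering is that the fib6_null_entry case is decided
before any clone is allocated, which is what lets the backtracking loops
(restart / redo_rt6_select) above operate purely on FIB entries.
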
From a64efe142f5e70b7e39276a414bbb3b96691c608 Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Tue, 17 Apr 2018 17:33:24 -0700
Subject: net/ipv6: introduce fib6_info struct and helpers

Add fib6_info struct and alloc, destroy, hold and release helpers.

Signed-off-by: David Ahern
Signed-off-by: David S. Miller
---
 include/net/ip6_fib.h | 55 ++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv6/ip6_fib.c    | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 115 insertions(+)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 159f651dee55..630392ae12d8 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -38,6 +38,7 @@
 #endif
 
 struct rt6_info;
+struct fib6_info;
 
 struct fib6_config {
 	u32		fc_table;
@@ -132,6 +133,46 @@ struct fib6_nh {
 	int			nh_weight;
 };
 
+struct fib6_info {
+	struct fib6_table		*rt6i_table;
+	struct fib6_info __rcu		*rt6_next;
+	struct fib6_node __rcu		*rt6i_node;
+
+	/* Multipath routes:
+	 * siblings is a list of fib6_info that have the same metric/weight,
+	 * destination, but not the same gateway. nsiblings is just a cache
+	 * to speed up lookup.
+	 */
+	struct list_head		rt6i_siblings;
+	unsigned int			rt6i_nsiblings;
+
+	atomic_t			rt6i_ref;
+	struct inet6_dev		*rt6i_idev;
+	unsigned long			expires;
+	struct dst_metrics		*fib6_metrics;
+#define fib6_pmtu		fib6_metrics->metrics[RTAX_MTU-1]
+
+	struct rt6key			rt6i_dst;
+	u32				rt6i_flags;
+	struct rt6key			rt6i_src;
+	struct rt6key			rt6i_prefsrc;
+
+	struct rt6_info * __percpu	*rt6i_pcpu;
+	struct rt6_exception_bucket __rcu *rt6i_exception_bucket;
+
+	u32				rt6i_metric;
+	u8				rt6i_protocol;
+	u8				fib6_type;
+	u8				exception_bucket_flushed:1,
+					should_flush:1,
+					dst_nocount:1,
+					dst_nopolicy:1,
+					dst_host:1,
+					unused:3;
+
+	struct fib6_nh			fib6_nh;
+};
+
 struct rt6_info {
 	struct dst_entry		dst;
 	struct rt6_info __rcu		*rt6_next;
@@ -291,6 +332,20 @@ static inline void ip6_rt_put(struct rt6_info *rt)
 
 void rt6_free_pcpu(struct rt6_info *non_pcpu_rt);
 
+struct rt6_info *fib6_info_alloc(gfp_t gfp_flags);
+void fib6_info_destroy(struct rt6_info *f6i);
+
+static inline void fib6_info_hold(struct rt6_info *f6i)
+{
+	atomic_inc(&f6i->rt6i_ref);
+}
+
+static inline void fib6_info_release(struct rt6_info *f6i)
+{
+	if (f6i && atomic_dec_and_test(&f6i->rt6i_ref))
+		fib6_info_destroy(f6i);
+}
+
 static inline void rt6_hold(struct rt6_info *rt)
 {
 	atomic_inc(&rt->rt6i_ref);
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 53df4a98a7f7..d07578d84db0 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -145,6 +145,66 @@ static __be32 addr_bit_set(const void *token, int fn_bit)
 	       addr[fn_bit >> 5];
 }
 
+struct rt6_info *fib6_info_alloc(gfp_t gfp_flags)
+{
+	struct rt6_info *f6i;
+
+	f6i = kzalloc(sizeof(*f6i), gfp_flags);
+	if (!f6i)
+		return NULL;
+
+	f6i->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags);
+	if (!f6i->rt6i_pcpu) {
+		kfree(f6i);
+		return NULL;
+	}
+
+	INIT_LIST_HEAD(&f6i->rt6i_siblings);
+	f6i->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
+
+	atomic_inc(&f6i->rt6i_ref);
+
+	return f6i;
+}
+
+void fib6_info_destroy(struct rt6_info *f6i)
+{
+	struct rt6_exception_bucket *bucket;
+
+	WARN_ON(f6i->rt6i_node);
+
+	bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, 1);
+	if (bucket) {
+		f6i->rt6i_exception_bucket = NULL;
+		kfree(bucket);
+	}
+
+	if (f6i->rt6i_pcpu) {
+		int cpu;
+
+		for_each_possible_cpu(cpu) {
+			struct rt6_info **ppcpu_rt;
+			struct rt6_info *pcpu_rt;
+
+			ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu);
+			pcpu_rt = *ppcpu_rt;
+			if (pcpu_rt) {
+				dst_dev_put(&pcpu_rt->dst);
+				dst_release(&pcpu_rt->dst);
+				*ppcpu_rt = NULL;
+			}
+		}
+	}
+
+	if (f6i->rt6i_idev)
+		in6_dev_put(f6i->rt6i_idev);
+	if (f6i->fib6_nh.nh_dev)
+		dev_put(f6i->fib6_nh.nh_dev);
+
+	kfree(f6i);
+}
+EXPORT_SYMBOL_GPL(fib6_info_destroy);
+
 static struct fib6_node *node_alloc(struct net *net)
 {
 	struct fib6_node *fn;
-- 
cgit v1.2.3

From 93531c6743157d7e8c5792f8ed1a57641149d62c Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Tue, 17 Apr 2018 17:33:25 -0700
Subject: net/ipv6: separate handling of FIB entries from dst based routes

Last step before flipping the data type for FIB entries:
- use fib6_info_alloc to create FIB entries in ip6_route_info_create
  and addrconf_dst_alloc
- use fib6_info_release in place of dst_release, ip6_rt_put and
  rt6_release
- remove the dst_hold before calling __ip6_ins_rt or ip6_del_rt
- when purging routes, drop per-cpu routes
- replace inc and dec of rt6i_ref with fib6_info_hold and
  fib6_info_release
- use rt->from since it points to the FIB entry
- drop references to exception bucket, fib6_metrics and per-cpu from
  dst entries (those are relevant for fib entries only)

Signed-off-by: David Ahern
Signed-off-by: David S. Miller
---
 include/net/ip6_fib.h   |   4 +-
 include/net/ip6_route.h |   3 +-
 net/ipv6/addrconf.c     |  18 +++--
 net/ipv6/anycast.c      |   7 +-
 net/ipv6/ip6_fib.c      |  55 ++++++++++------
 net/ipv6/ip6_output.c   |   3 +-
 net/ipv6/ndisc.c        |   6 +-
 net/ipv6/route.c        | 171 +++++++++++++++++------------------------------
 8 files changed, 115 insertions(+), 152 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 630392ae12d8..6c3d92bb3459 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -314,9 +314,7 @@ static inline u32 rt6_get_cookie(const struct rt6_info *rt)
 	if (rt->rt6i_flags & RTF_PCPU ||
 	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
-		rt = rt->from;
-
-	rt6_get_cookie_safe(rt, &cookie);
+		rt6_get_cookie_safe(rt->from, &cookie);
 
 	return cookie;
 }
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 686cdc7f356a..57d0d45667f1 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -114,8 +114,7 @@ static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt,
 					 unsigned int prefs,
 					 struct in6_addr *saddr)
 {
-	struct inet6_dev *idev =
-			rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL;
+	struct inet6_dev *idev = rt ?
rt->rt6i_idev : NULL; int err = 0; if (rt && rt->rt6i_prefsrc.plen) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 915cd0734b27..e533a447f68c 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -916,7 +916,6 @@ void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp) pr_warn("Freeing alive inet6 address %p\n", ifp); return; } - ip6_rt_put(ifp->rt); kfree_rcu(ifp, rcu); } @@ -1102,8 +1101,8 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, inet6addr_notifier_call_chain(NETDEV_UP, ifa); out: if (unlikely(err < 0)) { - if (rt) - ip6_rt_put(rt); + fib6_info_release(rt); + if (ifa) { if (ifa->idev) in6_dev_put(ifa->idev); @@ -1191,7 +1190,7 @@ cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_r else { if (!(rt->rt6i_flags & RTF_EXPIRES)) fib6_set_expires(rt, expires); - ip6_rt_put(rt); + fib6_info_release(rt); } } } @@ -2375,8 +2374,7 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, continue; if ((rt->rt6i_flags & noflags) != 0) continue; - if (!dst_hold_safe(&rt->dst)) - rt = NULL; + fib6_info_hold(rt); break; } out: @@ -2687,7 +2685,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len, dev, expires, flags, GFP_ATOMIC); } - ip6_rt_put(rt); + fib6_info_release(rt); } /* Try to figure out our local address for this prefix */ @@ -3361,7 +3359,7 @@ static int fixup_permanent_addr(struct net *net, ifp->rt = rt; spin_unlock(&ifp->lock); - ip6_rt_put(prev); + fib6_info_release(prev); } if (!(ifp->flags & IFA_F_NOPREFIXROUTE)) { @@ -5636,8 +5634,8 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) ip6_del_rt(net, rt); } if (ifp->rt) { - if (dst_hold_safe(&ifp->rt->dst)) - ip6_del_rt(net, ifp->rt); + ip6_del_rt(net, ifp->rt); + ifp->rt = NULL; } rt_genid_bump_ipv6(net); break; diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index e456386fe4d5..3db8fe10322b 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -213,7 +213,7 @@ static void aca_put(struct ifacaddr6 *ac) { if (refcount_dec_and_test(&ac->aca_refcnt)) { in6_dev_put(ac->aca_idev); - dst_release(&ac->aca_rt->dst); + fib6_info_release(ac->aca_rt); kfree(ac); } } @@ -231,6 +231,7 @@ static struct ifacaddr6 *aca_alloc(struct rt6_info *rt, aca->aca_addr = *addr; in6_dev_hold(idev); aca->aca_idev = idev; + fib6_info_hold(rt); aca->aca_rt = rt; aca->aca_users = 1; /* aca_tstamp should be updated upon changes */ @@ -274,7 +275,7 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) } aca = aca_alloc(rt, addr); if (!aca) { - ip6_rt_put(rt); + fib6_info_release(rt); err = -ENOMEM; goto out; } @@ -330,7 +331,6 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) write_unlock_bh(&idev->lock); addrconf_leave_solict(idev, &aca->aca_addr); - dst_hold(&aca->aca_rt->dst); ip6_del_rt(dev_net(idev->dev), aca->aca_rt); aca_put(aca); @@ -358,7 +358,6 @@ void ipv6_ac_destroy_dev(struct inet6_dev *idev) addrconf_leave_solict(idev, &aca->aca_addr); - dst_hold(&aca->aca_rt->dst); ip6_del_rt(dev_net(idev->dev), aca->aca_rt); aca_put(aca); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index d07578d84db0..4d6bd033dccd 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -170,6 +170,7 @@ struct rt6_info *fib6_info_alloc(gfp_t gfp_flags) void fib6_info_destroy(struct rt6_info *f6i) { struct rt6_exception_bucket *bucket; + struct dst_metrics *m; WARN_ON(f6i->rt6i_node); @@ -201,6 +202,10 @@ 
void fib6_info_destroy(struct rt6_info *f6i) if (f6i->fib6_nh.nh_dev) dev_put(f6i->fib6_nh.nh_dev); + m = f6i->fib6_metrics; + if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt)) + kfree(m); + kfree(f6i); } EXPORT_SYMBOL_GPL(fib6_info_destroy); @@ -714,7 +719,7 @@ static struct fib6_node *fib6_add_1(struct net *net, /* clean up an intermediate node */ if (!(fn->fn_flags & RTN_RTINFO)) { RCU_INIT_POINTER(fn->leaf, NULL); - rt6_release(leaf); + fib6_info_release(leaf); /* remove null_entry in the root node */ } else if (fn->fn_flags & RTN_TL_ROOT && rcu_access_pointer(fn->leaf) == @@ -898,12 +903,32 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) { new_leaf = fib6_find_prefix(net, table, fn); atomic_inc(&new_leaf->rt6i_ref); + rcu_assign_pointer(fn->leaf, new_leaf); - rt6_release(rt); + fib6_info_release(rt); } fn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); } + + if (rt->rt6i_pcpu) { + int cpu; + + /* release the reference to this fib entry from + * all of its cached pcpu routes + */ + for_each_possible_cpu(cpu) { + struct rt6_info **ppcpu_rt; + struct rt6_info *pcpu_rt; + + ppcpu_rt = per_cpu_ptr(rt->rt6i_pcpu, cpu); + pcpu_rt = *ppcpu_rt; + if (pcpu_rt) { + fib6_info_release(pcpu_rt->from); + pcpu_rt->from = NULL; + } + } + } } } @@ -1099,7 +1124,7 @@ add: fib6_purge_rt(iter, fn, info->nl_net); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; - rt6_release(iter); + fib6_info_release(iter); if (nsiblings) { /* Replacing an ECMP route, remove all siblings */ @@ -1115,7 +1140,7 @@ add: fib6_purge_rt(iter, fn, info->nl_net); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; - rt6_release(iter); + fib6_info_release(iter); nsiblings--; info->nl_net->ipv6.rt6_stats->fib_rt_entries--; } else { @@ -1183,9 +1208,6 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, int replace_required = 0; int sernum = fib6_new_sernum(info->nl_net); - if (WARN_ON_ONCE(!atomic_read(&rt->dst.__refcnt))) - return -EINVAL; - if (info->nlh) { if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) allow_create = 0; @@ -1300,7 +1322,7 @@ out: if (pn_leaf == rt) { pn_leaf = NULL; RCU_INIT_POINTER(pn->leaf, NULL); - atomic_dec(&rt->rt6i_ref); + fib6_info_release(rt); } if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) { pn_leaf = fib6_find_prefix(info->nl_net, table, @@ -1312,7 +1334,7 @@ out: info->nl_net->ipv6.fib6_null_entry; } #endif - atomic_inc(&pn_leaf->rt6i_ref); + fib6_info_hold(pn_leaf); rcu_assign_pointer(pn->leaf, pn_leaf); } } @@ -1334,10 +1356,6 @@ failure: (fn->fn_flags & RTN_TL_ROOT && !rcu_access_pointer(fn->leaf)))) fib6_repair_tree(info->nl_net, table, fn); - /* Always release dst as dst->__refcnt is guaranteed - * to be taken before entering this function - */ - dst_release_immediate(&rt->dst); return err; } @@ -1637,7 +1655,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net, new_fn_leaf = net->ipv6.fib6_null_entry; } #endif - atomic_inc(&new_fn_leaf->rt6i_ref); + fib6_info_hold(new_fn_leaf); rcu_assign_pointer(fn->leaf, new_fn_leaf); return pn; } @@ -1693,7 +1711,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net, return pn; RCU_INIT_POINTER(pn->leaf, NULL); - rt6_release(pn_leaf); + fib6_info_release(pn_leaf); fn = pn; } } @@ -1763,7 +1781,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL); if (!info->skip_notify) inet6_rt_notify(RTM_DELROUTE, rt, 
info, 0); - rt6_release(rt); + fib6_info_release(rt); } /* Need to own table->tb6_lock */ @@ -2261,9 +2279,8 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v) dev = rt->fib6_nh.nh_dev; seq_printf(seq, " %08x %08x %08x %08x %8s\n", - rt->rt6i_metric, atomic_read(&rt->dst.__refcnt), - rt->dst.__use, rt->rt6i_flags, - dev ? dev->name : ""); + rt->rt6i_metric, atomic_read(&rt->rt6i_ref), 0, + rt->rt6i_flags, dev ? dev->name : ""); iter->w.leaf = NULL; return 0; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index a39b04f9fa6e..3db47986ef38 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -968,7 +968,8 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, if (!had_dst) *dst = ip6_route_output(net, sk, fl6); rt = (*dst)->error ? NULL : (struct rt6_info *)*dst; - err = ip6_route_get_saddr(net, rt, &fl6->daddr, + err = ip6_route_get_saddr(net, rt ? rt->from : NULL, + &fl6->daddr, sk ? inet6_sk(sk)->srcprefs : 0, &fl6->saddr); if (err) diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 556717154fa3..a28857088bff 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1283,7 +1283,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) ND_PRINTK(0, err, "RA: %s got default router without neighbour\n", __func__); - ip6_rt_put(rt); + fib6_info_release(rt); return; } } @@ -1313,7 +1313,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) ND_PRINTK(0, err, "RA: %s got default router without neighbour\n", __func__); - ip6_rt_put(rt); + fib6_info_release(rt); return; } neigh->flags |= NTF_ROUTER; @@ -1499,7 +1499,7 @@ skip_routeinfo: ND_PRINTK(2, warn, "RA: invalid RA options\n"); } out: - ip6_rt_put(rt); + fib6_info_release(rt); if (neigh) neigh_release(neigh); } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index ad9eaecf539c..0d861bd07673 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -351,13 +351,11 @@ static void rt6_info_init(struct rt6_info *rt) memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); INIT_LIST_HEAD(&rt->rt6i_siblings); INIT_LIST_HEAD(&rt->rt6i_uncached); - rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; } /* allocate dst with ip6_dst_ops */ -static struct rt6_info *__ip6_dst_alloc(struct net *net, - struct net_device *dev, - int flags) +struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, + int flags) { struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, flags); @@ -369,35 +367,15 @@ static struct rt6_info *__ip6_dst_alloc(struct net *net, return rt; } - -struct rt6_info *ip6_dst_alloc(struct net *net, - struct net_device *dev, - int flags) -{ - struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags); - - if (rt) { - rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC); - if (!rt->rt6i_pcpu) { - dst_release_immediate(&rt->dst); - return NULL; - } - } - - return rt; -} EXPORT_SYMBOL(ip6_dst_alloc); static void ip6_dst_destroy(struct dst_entry *dst) { struct rt6_info *rt = (struct rt6_info *)dst; - struct rt6_exception_bucket *bucket; struct rt6_info *from = rt->from; struct inet6_dev *idev; - struct dst_metrics *m; dst_destroy_metrics_generic(dst); - free_percpu(rt->rt6i_pcpu); rt6_uncached_list_del(rt); idev = rt->rt6i_idev; @@ -405,18 +383,9 @@ static void ip6_dst_destroy(struct dst_entry *dst) rt->rt6i_idev = NULL; in6_dev_put(idev); } - bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1); - if (bucket) { - rt->rt6i_exception_bucket = NULL; - kfree(bucket); - } - - m = rt->fib6_metrics; - if (m != 
&dst_default_metrics && refcount_dec_and_test(&m->refcnt)) - kfree(m); rt->from = NULL; - dst_release(&from->dst); + fib6_info_release(from); } static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, @@ -891,7 +860,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, else fib6_set_expires(rt, jiffies + HZ * lifetime); - ip6_rt_put(rt); + fib6_info_release(rt); } return 0; } @@ -1010,11 +979,9 @@ static void ip6_rt_init_dst(struct rt6_info *rt, struct rt6_info *ort) static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from) { - BUG_ON(from->from); - rt->rt6i_flags &= ~RTF_EXPIRES; - if (dst_hold_safe(&from->dst)) - rt->from = from; + fib6_info_hold(from); + rt->from = from; dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true); if (from->fib6_metrics != &dst_default_metrics) { rt->dst._metrics |= DST_METRICS_REFCOUNTED; @@ -1084,7 +1051,7 @@ static struct rt6_info *ip6_create_rt_rcu(struct rt6_info *rt) struct net_device *dev = rt->fib6_nh.nh_dev; struct rt6_info *nrt; - nrt = __ip6_dst_alloc(dev_net(dev), dev, flags); + nrt = ip6_dst_alloc(dev_net(dev), dev, flags); if (nrt) ip6_rt_copy_init(nrt, rt); @@ -1203,8 +1170,6 @@ int ip6_ins_rt(struct net *net, struct rt6_info *rt) { struct nl_info info = { .nl_net = net, }; - /* Hold dst to account for the reference from the fib6 tree */ - dst_hold(&rt->dst); return __ip6_ins_rt(rt, &info, NULL); } @@ -1221,7 +1186,7 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, rcu_read_lock(); dev = ip6_rt_get_dev_rcu(ort); - rt = __ip6_dst_alloc(dev_net(dev), dev, 0); + rt = ip6_dst_alloc(dev_net(dev), dev, 0); rcu_read_unlock(); if (!rt) return NULL; @@ -1256,7 +1221,7 @@ static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt) rcu_read_lock(); dev = ip6_rt_get_dev_rcu(rt); - pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, flags); + pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags); rcu_read_unlock(); if (!pcpu_rt) return NULL; @@ -1317,7 +1282,7 @@ static void rt6_remove_exception(struct rt6_exception_bucket *bucket, net = dev_net(rt6_ex->rt6i->dst.dev); rt6_ex->rt6i->rt6i_node = NULL; hlist_del_rcu(&rt6_ex->hlist); - rt6_release(rt6_ex->rt6i); + ip6_rt_put(rt6_ex->rt6i); kfree_rcu(rt6_ex, rcu); WARN_ON_ONCE(!bucket->depth); bucket->depth--; @@ -1907,17 +1872,11 @@ redo_rt6_select: struct rt6_info *uncached_rt; - if (ip6_hold_safe(net, &f6i, true)) { - dst_use_noref(&f6i->dst, jiffies); - } else { - rcu_read_unlock(); - uncached_rt = f6i; - goto uncached_rt_out; - } + fib6_info_hold(f6i); rcu_read_unlock(); uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL); - dst_release(&rt->dst); + fib6_info_release(f6i); if (uncached_rt) { /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc() @@ -1930,7 +1889,6 @@ redo_rt6_select: dst_hold(&uncached_rt->dst); } -uncached_rt_out: trace_fib6_table_lookup(net, uncached_rt, table, fl6); return uncached_rt; @@ -1939,24 +1897,12 @@ uncached_rt_out: struct rt6_info *pcpu_rt; - dst_use_noref(&f6i->dst, jiffies); local_bh_disable(); pcpu_rt = rt6_get_pcpu_route(f6i); - if (!pcpu_rt) { - /* atomic_inc_not_zero() is needed when using rcu */ - if (atomic_inc_not_zero(&f6i->rt6i_ref)) { - /* No dst_hold() on rt is needed because grabbing - * rt->rt6i_ref makes sure rt can't be released. 
- */ - pcpu_rt = rt6_make_pcpu_route(net, f6i); - rt6_release(f6i); - } else { - /* rt is already removed from tree */ - pcpu_rt = net->ipv6.ip6_null_entry; - dst_hold(&pcpu_rt->dst); - } - } + if (!pcpu_rt) + pcpu_rt = rt6_make_pcpu_route(net, f6i); + local_bh_enable(); rcu_read_unlock(); trace_fib6_table_lookup(net, pcpu_rt, table, fl6); @@ -2193,11 +2139,26 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori * Destination cache support functions */ +static bool fib6_check(struct rt6_info *f6i, u32 cookie) +{ + u32 rt_cookie = 0; + + if ((f6i && !rt6_get_cookie_safe(f6i, &rt_cookie)) || + rt_cookie != cookie) + return false; + + if (fib6_check_expired(f6i)) + return false; + + return true; +} + static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) { u32 rt_cookie = 0; - if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie) + if ((rt->from && !rt6_get_cookie_safe(rt->from, &rt_cookie)) || + rt_cookie != cookie) return NULL; if (rt6_check_expired(rt)) @@ -2210,7 +2171,7 @@ static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie) { if (!__rt6_check_expired(rt) && rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && - rt6_check(rt->from, cookie)) + fib6_check(rt->from, cookie)) return &rt->dst; else return NULL; @@ -2241,7 +2202,7 @@ static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) if (rt) { if (rt->rt6i_flags & RTF_CACHE) { if (rt6_check_expired(rt)) { - ip6_del_rt(dev_net(dst->dev), rt); + rt6_remove_exception_rt(rt); dst = NULL; } } else { @@ -2262,12 +2223,12 @@ static void ip6_link_failure(struct sk_buff *skb) if (rt) { if (rt->rt6i_flags & RTF_CACHE) { if (dst_hold_safe(&rt->dst)) - ip6_del_rt(dev_net(rt->dst.dev), rt); - } else { + rt6_remove_exception_rt(rt); + } else if (rt->from) { struct fib6_node *fn; rcu_read_lock(); - fn = rcu_dereference(rt->rt6i_node); + fn = rcu_dereference(rt->from->rt6i_node); if (fn && (rt->rt6i_flags & RTF_DEFAULT)) fn->fn_sernum = -1; rcu_read_unlock(); @@ -2949,13 +2910,13 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, if (!table) goto out; - rt = ip6_dst_alloc(net, NULL, - (cfg->fc_flags & RTF_ADDRCONF) ? 
0 : DST_NOCOUNT); - - if (!rt) { - err = -ENOMEM; + err = -ENOMEM; + rt = fib6_info_alloc(gfp_flags); + if (!rt) goto out; - } + + if (cfg->fc_flags & RTF_ADDRCONF) + rt->dst_nocount = true; err = ip6_convert_metrics(net, rt, cfg); if (err < 0) @@ -3029,7 +2990,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, if (err) goto out; - rt->fib6_nh.nh_gw = rt->rt6i_gateway = cfg->fc_gateway; + rt->fib6_nh.nh_gw = cfg->fc_gateway; } err = -ENODEV; @@ -3066,7 +3027,7 @@ install_route: !netif_carrier_ok(dev)) rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK); - rt->fib6_nh.nh_dev = rt->dst.dev = dev; + rt->fib6_nh.nh_dev = dev; rt->rt6i_idev = idev; rt->rt6i_table = table; @@ -3078,9 +3039,8 @@ out: dev_put(dev); if (idev) in6_dev_put(idev); - if (rt) - dst_release_immediate(&rt->dst); + fib6_info_release(rt); return ERR_PTR(err); } @@ -3095,6 +3055,7 @@ int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, return PTR_ERR(rt); err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); + fib6_info_release(rt); return err; } @@ -3116,7 +3077,7 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) spin_unlock_bh(&table->tb6_lock); out: - ip6_rt_put(rt); + fib6_info_release(rt); return err; } @@ -3170,7 +3131,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) out_unlock: spin_unlock_bh(&table->tb6_lock); out_put: - ip6_rt_put(rt); + fib6_info_release(rt); if (skb) { rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, @@ -3241,8 +3202,7 @@ static int ip6_route_del(struct fib6_config *cfg, continue; if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol) continue; - if (!dst_hold_safe(&rt->dst)) - break; + fib6_info_hold(rt); rcu_read_unlock(); /* if gateway was specified only delete the one hop */ @@ -3510,12 +3470,9 @@ restart: for_each_fib6_node_rt_rcu(&table->tb6_root) { if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) && (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) { - if (dst_hold_safe(&rt->dst)) { - rcu_read_unlock(); - ip6_del_rt(net, rt); - } else { - rcu_read_unlock(); - } + fib6_info_hold(rt); + rcu_read_unlock(); + ip6_del_rt(net, rt); goto restart; } } @@ -3666,7 +3623,7 @@ struct rt6_info *addrconf_dst_alloc(struct net *net, struct net_device *dev = idev->dev; struct rt6_info *rt; - rt = ip6_dst_alloc(net, dev, DST_NOCOUNT); + rt = fib6_info_alloc(gfp_flags); if (!rt) return ERR_PTR(-ENOMEM); @@ -3687,8 +3644,8 @@ struct rt6_info *addrconf_dst_alloc(struct net *net, } rt->fib6_nh.nh_gw = *addr; + dev_hold(dev); rt->fib6_nh.nh_dev = dev; - rt->rt6i_gateway = *addr; rt->rt6i_dst.addr = *addr; rt->rt6i_dst.plen = 128; tb_id = l3mdev_fib_table(idev->dev) ? 
: RT6_TABLE_LOCAL; @@ -4325,7 +4282,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, err = ip6_route_info_append(info->nl_net, &rt6_nh_list, rt, &r_cfg); if (err) { - dst_release_immediate(&rt->dst); + fib6_info_release(rt); goto cleanup; } @@ -4342,6 +4299,8 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, list_for_each_entry(nh, &rt6_nh_list, next) { rt_last = nh->rt6_info; err = __ip6_ins_rt(nh->rt6_info, info, extack); + fib6_info_release(nh->rt6_info); + /* save reference to first route for notification */ if (!rt_notif && !err) rt_notif = nh->rt6_info; @@ -4389,7 +4348,7 @@ add_errout: cleanup: list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { if (nh->rt6_info) - dst_release_immediate(&nh->rt6_info->dst); + fib6_info_release(nh->rt6_info); list_del(&nh->next); kfree(nh); } @@ -4814,14 +4773,6 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, goto errout; } - if (fibmatch && rt->from) { - struct rt6_info *ort = rt->from; - - dst_hold(&ort->dst); - ip6_rt_put(rt); - rt = ort; - } - skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) { ip6_rt_put(rt); @@ -4831,12 +4782,12 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, skb_dst_set(skb, &rt->dst); if (fibmatch) - err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, iif, + err = rt6_fill_node(net, skb, rt->from, NULL, NULL, NULL, iif, RTM_NEWROUTE, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0); else - err = rt6_fill_node(net, skb, rt, dst, &fl6.daddr, &fl6.saddr, - iif, RTM_NEWROUTE, + err = rt6_fill_node(net, skb, rt->from, dst, + &fl6.daddr, &fl6.saddr, iif, RTM_NEWROUTE, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0); if (err < 0) { -- cgit v1.2.3 From 8d1c802b2815edc97af8a58c5045ebaf3848621a Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:26 -0700 Subject: net/ipv6: Flip FIB entries to fib6_info Convert all code paths referencing a FIB entry from rt6_info to fib6_info. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- .../net/ethernet/mellanox/mlxsw/spectrum_router.c | 64 ++--- include/net/if_inet6.h | 4 +- include/net/ip6_fib.h | 42 ++-- include/net/ip6_route.h | 28 +-- include/net/netns/ipv6.h | 2 +- net/ipv6/addrconf.c | 20 +- net/ipv6/anycast.c | 4 +- net/ipv6/ip6_fib.c | 116 ++++----- net/ipv6/ndisc.c | 2 +- net/ipv6/route.c | 259 +++++++++++---------- 10 files changed, 271 insertions(+), 270 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index d995a0b52d7c..b85b15851841 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -442,7 +442,7 @@ struct mlxsw_sp_fib6_entry { struct mlxsw_sp_rt6 { struct list_head list; - struct rt6_info *rt; + struct fib6_info *rt; }; struct mlxsw_sp_lpm_tree { @@ -3834,7 +3834,7 @@ mlxsw_sp_rt6_nexthop(struct mlxsw_sp_nexthop_group *nh_grp, for (i = 0; i < nh_grp->count; i++) { struct mlxsw_sp_nexthop *nh = &nh_grp->nexthops[i]; - struct rt6_info *rt = mlxsw_sp_rt6->rt; + struct fib6_info *rt = mlxsw_sp_rt6->rt; if (nh->rif && nh->rif->dev == rt->fib6_nh.nh_dev && ipv6_addr_equal((const struct in6_addr *) &nh->gw_addr, @@ -3920,7 +3920,7 @@ mlxsw_sp_fib6_entry_offload_unset(struct mlxsw_sp_fib_entry *fib_entry) fib6_entry = container_of(fib_entry, struct mlxsw_sp_fib6_entry, common); list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) { - struct rt6_info *rt = mlxsw_sp_rt6->rt; + struct fib6_info *rt = mlxsw_sp_rt6->rt; rt->fib6_nh.nh_flags &= ~RTNH_F_OFFLOAD; } @@ -4699,7 +4699,7 @@ static void mlxsw_sp_router_fib4_del(struct mlxsw_sp *mlxsw_sp, mlxsw_sp_fib_node_put(mlxsw_sp, fib_node); } -static bool mlxsw_sp_fib6_rt_should_ignore(const struct rt6_info *rt) +static bool mlxsw_sp_fib6_rt_should_ignore(const struct fib6_info *rt) { /* Packets with link-local destination IP arriving to the router * are trapped to the CPU, so no need to program specific routes @@ -4721,7 +4721,7 @@ static bool mlxsw_sp_fib6_rt_should_ignore(const struct rt6_info *rt) return false; } -static struct mlxsw_sp_rt6 *mlxsw_sp_rt6_create(struct rt6_info *rt) +static struct mlxsw_sp_rt6 *mlxsw_sp_rt6_create(struct fib6_info *rt) { struct mlxsw_sp_rt6 *mlxsw_sp_rt6; @@ -4734,18 +4734,18 @@ static struct mlxsw_sp_rt6 *mlxsw_sp_rt6_create(struct rt6_info *rt) * memory. 
*/ mlxsw_sp_rt6->rt = rt; - rt6_hold(rt); + fib6_info_hold(rt); return mlxsw_sp_rt6; } #if IS_ENABLED(CONFIG_IPV6) -static void mlxsw_sp_rt6_release(struct rt6_info *rt) +static void mlxsw_sp_rt6_release(struct fib6_info *rt) { - rt6_release(rt); + fib6_info_release(rt); } #else -static void mlxsw_sp_rt6_release(struct rt6_info *rt) +static void mlxsw_sp_rt6_release(struct fib6_info *rt) { } #endif @@ -4756,13 +4756,13 @@ static void mlxsw_sp_rt6_destroy(struct mlxsw_sp_rt6 *mlxsw_sp_rt6) kfree(mlxsw_sp_rt6); } -static bool mlxsw_sp_fib6_rt_can_mp(const struct rt6_info *rt) +static bool mlxsw_sp_fib6_rt_can_mp(const struct fib6_info *rt) { /* RTF_CACHE routes are ignored */ return (rt->rt6i_flags & (RTF_GATEWAY | RTF_ADDRCONF)) == RTF_GATEWAY; } -static struct rt6_info * +static struct fib6_info * mlxsw_sp_fib6_entry_rt(const struct mlxsw_sp_fib6_entry *fib6_entry) { return list_first_entry(&fib6_entry->rt6_list, struct mlxsw_sp_rt6, @@ -4771,7 +4771,7 @@ mlxsw_sp_fib6_entry_rt(const struct mlxsw_sp_fib6_entry *fib6_entry) static struct mlxsw_sp_fib6_entry * mlxsw_sp_fib6_node_mp_entry_find(const struct mlxsw_sp_fib_node *fib_node, - const struct rt6_info *nrt, bool replace) + const struct fib6_info *nrt, bool replace) { struct mlxsw_sp_fib6_entry *fib6_entry; @@ -4779,7 +4779,7 @@ mlxsw_sp_fib6_node_mp_entry_find(const struct mlxsw_sp_fib_node *fib_node, return NULL; list_for_each_entry(fib6_entry, &fib_node->entry_list, common.list) { - struct rt6_info *rt = mlxsw_sp_fib6_entry_rt(fib6_entry); + struct fib6_info *rt = mlxsw_sp_fib6_entry_rt(fib6_entry); /* RT6_TABLE_LOCAL and RT6_TABLE_MAIN share the same * virtual router. @@ -4802,7 +4802,7 @@ mlxsw_sp_fib6_node_mp_entry_find(const struct mlxsw_sp_fib_node *fib_node, static struct mlxsw_sp_rt6 * mlxsw_sp_fib6_entry_rt_find(const struct mlxsw_sp_fib6_entry *fib6_entry, - const struct rt6_info *rt) + const struct fib6_info *rt) { struct mlxsw_sp_rt6 *mlxsw_sp_rt6; @@ -4815,7 +4815,7 @@ mlxsw_sp_fib6_entry_rt_find(const struct mlxsw_sp_fib6_entry *fib6_entry, } static bool mlxsw_sp_nexthop6_ipip_type(const struct mlxsw_sp *mlxsw_sp, - const struct rt6_info *rt, + const struct fib6_info *rt, enum mlxsw_sp_ipip_type *ret) { return rt->fib6_nh.nh_dev && @@ -4825,7 +4825,7 @@ static bool mlxsw_sp_nexthop6_ipip_type(const struct mlxsw_sp *mlxsw_sp, static int mlxsw_sp_nexthop6_type_init(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_nexthop_group *nh_grp, struct mlxsw_sp_nexthop *nh, - const struct rt6_info *rt) + const struct fib6_info *rt) { const struct mlxsw_sp_ipip_ops *ipip_ops; struct mlxsw_sp_ipip_entry *ipip_entry; @@ -4870,7 +4870,7 @@ static void mlxsw_sp_nexthop6_type_fini(struct mlxsw_sp *mlxsw_sp, static int mlxsw_sp_nexthop6_init(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_nexthop_group *nh_grp, struct mlxsw_sp_nexthop *nh, - const struct rt6_info *rt) + const struct fib6_info *rt) { struct net_device *dev = rt->fib6_nh.nh_dev; @@ -4897,7 +4897,7 @@ static void mlxsw_sp_nexthop6_fini(struct mlxsw_sp *mlxsw_sp, } static bool mlxsw_sp_rt6_is_gateway(const struct mlxsw_sp *mlxsw_sp, - const struct rt6_info *rt) + const struct fib6_info *rt) { return rt->rt6i_flags & RTF_GATEWAY || mlxsw_sp_nexthop6_ipip_type(mlxsw_sp, rt, NULL); @@ -4928,7 +4928,7 @@ mlxsw_sp_nexthop6_group_create(struct mlxsw_sp *mlxsw_sp, nh_grp->gateway = mlxsw_sp_rt6_is_gateway(mlxsw_sp, mlxsw_sp_rt6->rt); nh_grp->count = fib6_entry->nrt6; for (i = 0; i < nh_grp->count; i++) { - struct rt6_info *rt = mlxsw_sp_rt6->rt; + struct fib6_info *rt = mlxsw_sp_rt6->rt; nh = 
&nh_grp->nexthops[i]; err = mlxsw_sp_nexthop6_init(mlxsw_sp, nh_grp, nh, rt); @@ -5040,7 +5040,7 @@ err_nexthop6_group_get: static int mlxsw_sp_fib6_entry_nexthop_add(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_fib6_entry *fib6_entry, - struct rt6_info *rt) + struct fib6_info *rt) { struct mlxsw_sp_rt6 *mlxsw_sp_rt6; int err; @@ -5068,7 +5068,7 @@ err_nexthop6_group_update: static void mlxsw_sp_fib6_entry_nexthop_del(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_fib6_entry *fib6_entry, - struct rt6_info *rt) + struct fib6_info *rt) { struct mlxsw_sp_rt6 *mlxsw_sp_rt6; @@ -5084,7 +5084,7 @@ mlxsw_sp_fib6_entry_nexthop_del(struct mlxsw_sp *mlxsw_sp, static void mlxsw_sp_fib6_entry_type_set(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_fib_entry *fib_entry, - const struct rt6_info *rt) + const struct fib6_info *rt) { /* Packets hitting RTF_REJECT routes need to be discarded by the * stack. We can rely on their destination device not having a @@ -5118,7 +5118,7 @@ mlxsw_sp_fib6_entry_rt_destroy_all(struct mlxsw_sp_fib6_entry *fib6_entry) static struct mlxsw_sp_fib6_entry * mlxsw_sp_fib6_entry_create(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_fib_node *fib_node, - struct rt6_info *rt) + struct fib6_info *rt) { struct mlxsw_sp_fib6_entry *fib6_entry; struct mlxsw_sp_fib_entry *fib_entry; @@ -5168,12 +5168,12 @@ static void mlxsw_sp_fib6_entry_destroy(struct mlxsw_sp *mlxsw_sp, static struct mlxsw_sp_fib6_entry * mlxsw_sp_fib6_node_entry_find(const struct mlxsw_sp_fib_node *fib_node, - const struct rt6_info *nrt, bool replace) + const struct fib6_info *nrt, bool replace) { struct mlxsw_sp_fib6_entry *fib6_entry, *fallback = NULL; list_for_each_entry(fib6_entry, &fib_node->entry_list, common.list) { - struct rt6_info *rt = mlxsw_sp_fib6_entry_rt(fib6_entry); + struct fib6_info *rt = mlxsw_sp_fib6_entry_rt(fib6_entry); if (rt->rt6i_table->tb6_id > nrt->rt6i_table->tb6_id) continue; @@ -5198,7 +5198,7 @@ mlxsw_sp_fib6_node_list_insert(struct mlxsw_sp_fib6_entry *new6_entry, bool replace) { struct mlxsw_sp_fib_node *fib_node = new6_entry->common.fib_node; - struct rt6_info *nrt = mlxsw_sp_fib6_entry_rt(new6_entry); + struct fib6_info *nrt = mlxsw_sp_fib6_entry_rt(new6_entry); struct mlxsw_sp_fib6_entry *fib6_entry; fib6_entry = mlxsw_sp_fib6_node_entry_find(fib_node, nrt, replace); @@ -5213,7 +5213,7 @@ mlxsw_sp_fib6_node_list_insert(struct mlxsw_sp_fib6_entry *new6_entry, struct mlxsw_sp_fib6_entry *last; list_for_each_entry(last, &fib_node->entry_list, common.list) { - struct rt6_info *rt = mlxsw_sp_fib6_entry_rt(last); + struct fib6_info *rt = mlxsw_sp_fib6_entry_rt(last); if (nrt->rt6i_table->tb6_id > rt->rt6i_table->tb6_id) break; @@ -5268,7 +5268,7 @@ mlxsw_sp_fib6_node_entry_unlink(struct mlxsw_sp *mlxsw_sp, static struct mlxsw_sp_fib6_entry * mlxsw_sp_fib6_entry_lookup(struct mlxsw_sp *mlxsw_sp, - const struct rt6_info *rt) + const struct fib6_info *rt) { struct mlxsw_sp_fib6_entry *fib6_entry; struct mlxsw_sp_fib_node *fib_node; @@ -5287,7 +5287,7 @@ mlxsw_sp_fib6_entry_lookup(struct mlxsw_sp *mlxsw_sp, return NULL; list_for_each_entry(fib6_entry, &fib_node->entry_list, common.list) { - struct rt6_info *iter_rt = mlxsw_sp_fib6_entry_rt(fib6_entry); + struct fib6_info *iter_rt = mlxsw_sp_fib6_entry_rt(fib6_entry); if (rt->rt6i_table->tb6_id == iter_rt->rt6i_table->tb6_id && rt->rt6i_metric == iter_rt->rt6i_metric && @@ -5316,7 +5316,7 @@ static void mlxsw_sp_fib6_entry_replace(struct mlxsw_sp *mlxsw_sp, } static int mlxsw_sp_router_fib6_add(struct mlxsw_sp *mlxsw_sp, - struct rt6_info *rt, bool 
replace) + struct fib6_info *rt, bool replace) { struct mlxsw_sp_fib6_entry *fib6_entry; struct mlxsw_sp_fib_node *fib_node; @@ -5373,7 +5373,7 @@ err_fib6_entry_nexthop_add: } static void mlxsw_sp_router_fib6_del(struct mlxsw_sp *mlxsw_sp, - struct rt6_info *rt) + struct fib6_info *rt) { struct mlxsw_sp_fib6_entry *fib6_entry; struct mlxsw_sp_fib_node *fib_node; @@ -5836,7 +5836,7 @@ static void mlxsw_sp_router_fib6_event(struct mlxsw_sp_fib_event_work *fib_work, fen6_info = container_of(info, struct fib6_entry_notifier_info, info); fib_work->fen6_info = *fen6_info; - rt6_hold(fib_work->fen6_info.rt); + fib6_info_hold(fib_work->fen6_info.rt); break; } } diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index d4088d1a688d..d6089b2e64fe 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -64,7 +64,7 @@ struct inet6_ifaddr { struct delayed_work dad_work; struct inet6_dev *idev; - struct rt6_info *rt; + struct fib6_info *rt; struct hlist_node addr_lst; struct list_head if_list; @@ -144,7 +144,7 @@ struct ipv6_ac_socklist { struct ifacaddr6 { struct in6_addr aca_addr; struct inet6_dev *aca_idev; - struct rt6_info *aca_rt; + struct fib6_info *aca_rt; struct ifacaddr6 *aca_next; int aca_users; refcount_t aca_refcnt; diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 6c3d92bb3459..d41b7bd69fb3 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -75,12 +75,12 @@ struct fib6_node { #ifdef CONFIG_IPV6_SUBTREES struct fib6_node __rcu *subtree; #endif - struct rt6_info __rcu *leaf; + struct fib6_info __rcu *leaf; __u16 fn_bit; /* bit key */ __u16 fn_flags; int fn_sernum; - struct rt6_info __rcu *rr_ptr; + struct fib6_info __rcu *rr_ptr; struct rcu_head rcu; }; @@ -176,7 +176,7 @@ struct fib6_info { struct rt6_info { struct dst_entry dst; struct rt6_info __rcu *rt6_next; - struct rt6_info *from; + struct fib6_info *from; /* * Tail elements of dst_entry (__refcnt etc.) 
@@ -242,20 +242,20 @@ static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) return ((struct rt6_info *)dst)->rt6i_idev; } -static inline void fib6_clean_expires(struct rt6_info *f6i) +static inline void fib6_clean_expires(struct fib6_info *f6i) { f6i->rt6i_flags &= ~RTF_EXPIRES; f6i->expires = 0; } -static inline void fib6_set_expires(struct rt6_info *f6i, +static inline void fib6_set_expires(struct fib6_info *f6i, unsigned long expires) { f6i->expires = expires; f6i->rt6i_flags |= RTF_EXPIRES; } -static inline bool fib6_check_expired(const struct rt6_info *f6i) +static inline bool fib6_check_expired(const struct fib6_info *f6i) { if (f6i->rt6i_flags & RTF_EXPIRES) return time_after(jiffies, f6i->expires); @@ -288,7 +288,7 @@ static inline void rt6_update_expires(struct rt6_info *rt0, int timeout) * Return true if we can get cookie safely * Return false if not */ -static inline bool rt6_get_cookie_safe(const struct rt6_info *rt, +static inline bool rt6_get_cookie_safe(const struct fib6_info *rt, u32 *cookie) { struct fib6_node *fn; @@ -330,15 +330,15 @@ static inline void ip6_rt_put(struct rt6_info *rt) void rt6_free_pcpu(struct rt6_info *non_pcpu_rt); -struct rt6_info *fib6_info_alloc(gfp_t gfp_flags); -void fib6_info_destroy(struct rt6_info *f6i); +struct fib6_info *fib6_info_alloc(gfp_t gfp_flags); +void fib6_info_destroy(struct fib6_info *f6i); -static inline void fib6_info_hold(struct rt6_info *f6i) +static inline void fib6_info_hold(struct fib6_info *f6i) { atomic_inc(&f6i->rt6i_ref); } -static inline void fib6_info_release(struct rt6_info *f6i) +static inline void fib6_info_release(struct fib6_info *f6i) { if (f6i && atomic_dec_and_test(&f6i->rt6i_ref)) fib6_info_destroy(f6i); @@ -371,7 +371,7 @@ enum fib6_walk_state { struct fib6_walker { struct list_head lh; struct fib6_node *root, *node; - struct rt6_info *leaf; + struct fib6_info *leaf; enum fib6_walk_state state; unsigned int skip; unsigned int count; @@ -435,7 +435,7 @@ typedef struct rt6_info *(*pol_lookup_t)(struct net *, struct fib6_entry_notifier_info { struct fib_notifier_info info; /* must be first */ - struct rt6_info *rt; + struct fib6_info *rt; }; /* @@ -457,14 +457,14 @@ struct fib6_node *fib6_locate(struct fib6_node *root, const struct in6_addr *saddr, int src_len, bool exact_match); -void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg), +void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *arg), void *arg); -int fib6_add(struct fib6_node *root, struct rt6_info *rt, +int fib6_add(struct fib6_node *root, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack); -int fib6_del(struct rt6_info *rt, struct nl_info *info); +int fib6_del(struct fib6_info *rt, struct nl_info *info); -void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info, +void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, unsigned int flags); void fib6_run_gc(unsigned long expires, struct net *net, bool force); @@ -487,11 +487,11 @@ void __net_exit fib6_notifier_exit(struct net *net); unsigned int fib6_tables_seq_read(struct net *net); int fib6_tables_dump(struct net *net, struct notifier_block *nb); -void fib6_update_sernum(struct net *net, struct rt6_info *rt); -void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt); +void fib6_update_sernum(struct net *net, struct fib6_info *rt); +void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt); -void fib6_metric_set(struct rt6_info *f6i, int metric, u32 
val); -static inline bool fib6_metric_locked(struct rt6_info *f6i, int metric) +void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val); +static inline bool fib6_metric_locked(struct fib6_info *f6i, int metric) { return !!(f6i->fib6_metrics->metrics[RTAX_LOCK - 1] & (1 << metric)); } diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 57d0d45667f1..d5fb1e4ae7ac 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -66,7 +66,7 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr) (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); } -static inline bool rt6_qualify_for_ecmp(const struct rt6_info *rt) +static inline bool rt6_qualify_for_ecmp(const struct fib6_info *rt) { return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == RTF_GATEWAY; @@ -102,14 +102,14 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg); int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack); -int ip6_ins_rt(struct net *net, struct rt6_info *rt); -int ip6_del_rt(struct net *net, struct rt6_info *rt); +int ip6_ins_rt(struct net *net, struct fib6_info *rt); +int ip6_del_rt(struct net *net, struct fib6_info *rt); -void rt6_flush_exceptions(struct rt6_info *rt); -void rt6_age_exceptions(struct rt6_info *rt, struct fib6_gc_args *gc_args, +void rt6_flush_exceptions(struct fib6_info *rt); +void rt6_age_exceptions(struct fib6_info *rt, struct fib6_gc_args *gc_args, unsigned long now); -static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt, +static inline int ip6_route_get_saddr(struct net *net, struct fib6_info *rt, const struct in6_addr *daddr, unsigned int prefs, struct in6_addr *saddr) @@ -136,9 +136,9 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6); void fib6_force_start_gc(struct net *net); -struct rt6_info *addrconf_dst_alloc(struct net *net, struct inet6_dev *idev, - const struct in6_addr *addr, bool anycast, - gfp_t gfp_flags); +struct fib6_info *addrconf_dst_alloc(struct net *net, struct inet6_dev *idev, + const struct in6_addr *addr, bool anycast, + gfp_t gfp_flags); struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, int flags); @@ -147,10 +147,10 @@ struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, * support functions for ND * */ -struct rt6_info *rt6_get_dflt_router(struct net *net, +struct fib6_info *rt6_get_dflt_router(struct net *net, const struct in6_addr *addr, struct net_device *dev); -struct rt6_info *rt6_add_dflt_router(struct net *net, +struct fib6_info *rt6_add_dflt_router(struct net *net, const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref); @@ -176,14 +176,14 @@ struct rt6_rtnl_dump_arg { struct net *net; }; -int rt6_dump_route(struct rt6_info *rt, void *p_arg); +int rt6_dump_route(struct fib6_info *rt, void *p_arg); void rt6_mtu_change(struct net_device *dev, unsigned int mtu); void rt6_remove_prefsrc(struct inet6_ifaddr *ifp); void rt6_clean_tohost(struct net *net, struct in6_addr *gateway); void rt6_sync_up(struct net_device *dev, unsigned int nh_flags); void rt6_disable_ip(struct net_device *dev, unsigned long event); void rt6_sync_down_dev(struct net_device *dev, unsigned long event); -void rt6_multipath_rebalance(struct rt6_info *rt); +void rt6_multipath_rebalance(struct fib6_info *rt); void rt6_uncached_list_add(struct rt6_info *rt); void rt6_uncached_list_del(struct rt6_info *rt); @@ -271,7 +271,7 @@ static inline struct in6_addr 
*rt6_nexthop(struct rt6_info *rt, return daddr; } -static inline bool rt6_duplicate_nexthop(struct rt6_info *a, struct rt6_info *b) +static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info *b) { return a->fib6_nh.nh_dev == b->fib6_nh.nh_dev && a->rt6i_idev == b->rt6i_idev && diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 74e4e1e449d5..97b3a54579c8 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -60,7 +60,7 @@ struct netns_ipv6 { #endif struct xt_table *ip6table_nat; #endif - struct rt6_info *fib6_null_entry; + struct fib6_info *fib6_null_entry; struct rt6_info *ip6_null_entry; struct rt6_statistics *rt6_stats; struct timer_list ip6_fib_timer; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index e533a447f68c..a294e86a9b25 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -170,7 +170,7 @@ static void addrconf_type_change(struct net_device *dev, unsigned long event); static int addrconf_ifdown(struct net_device *dev, int how); -static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, +static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, int plen, const struct net_device *dev, u32 flags, u32 noflags); @@ -994,7 +994,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, gfp_t gfp_flags = can_block ? GFP_KERNEL : GFP_ATOMIC; struct net *net = dev_net(idev->dev); struct inet6_ifaddr *ifa = NULL; - struct rt6_info *rt = NULL; + struct fib6_info *rt = NULL; int err = 0; int addr_type = ipv6_addr_type(addr); @@ -1178,7 +1178,7 @@ check_cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long *expires) static void cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_rt) { - struct rt6_info *rt; + struct fib6_info *rt; rt = addrconf_get_prefix_route(&ifp->addr, ifp->prefix_len, @@ -2348,13 +2348,13 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev, } -static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, +static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, int plen, const struct net_device *dev, u32 flags, u32 noflags) { struct fib6_node *fn; - struct rt6_info *rt = NULL; + struct fib6_info *rt = NULL; struct fib6_table *table; u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX; @@ -2641,7 +2641,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) */ if (pinfo->onlink) { - struct rt6_info *rt; + struct fib6_info *rt; unsigned long rt_expires; /* Avoid arithmetic overflow. Really, we could @@ -3346,7 +3346,7 @@ static int fixup_permanent_addr(struct net *net, * case regenerate the host route. 
*/ if (!ifp->rt || !ifp->rt->rt6i_node) { - struct rt6_info *rt, *prev; + struct fib6_info *rt, *prev; rt = addrconf_dst_alloc(net, idev, &ifp->addr, false, GFP_ATOMIC); @@ -3713,7 +3713,7 @@ restart: keep_addr = (!how && _keep_addr > 0 && !idev->cnf.disable_ipv6); list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) { - struct rt6_info *rt = NULL; + struct fib6_info *rt = NULL; bool keep; addrconf_del_dad_work(ifa); @@ -5626,7 +5626,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) addrconf_leave_anycast(ifp); addrconf_leave_solict(ifp->idev, &ifp->addr); if (!ipv6_addr_any(&ifp->peer_addr)) { - struct rt6_info *rt; + struct fib6_info *rt; rt = addrconf_get_prefix_route(&ifp->peer_addr, 128, ifp->idev->dev, 0, 0); @@ -5982,7 +5982,7 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val) list_for_each_entry(ifa, &idev->addr_list, if_list) { spin_lock(&ifa->lock); if (ifa->rt) { - struct rt6_info *rt = ifa->rt; + struct fib6_info *rt = ifa->rt; int cpu; rcu_read_lock(); diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 3db8fe10322b..0e35657501a7 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -218,7 +218,7 @@ static void aca_put(struct ifacaddr6 *ac) } } -static struct ifacaddr6 *aca_alloc(struct rt6_info *rt, +static struct ifacaddr6 *aca_alloc(struct fib6_info *rt, const struct in6_addr *addr) { struct inet6_dev *idev = rt->rt6i_idev; @@ -247,7 +247,7 @@ static struct ifacaddr6 *aca_alloc(struct rt6_info *rt, int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) { struct ifacaddr6 *aca; - struct rt6_info *rt; + struct fib6_info *rt; struct net *net; int err; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 4d6bd033dccd..77cf43b2d858 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -43,7 +43,7 @@ static struct kmem_cache *fib6_node_kmem __read_mostly; struct fib6_cleaner { struct fib6_walker w; struct net *net; - int (*func)(struct rt6_info *, void *arg); + int (*func)(struct fib6_info *, void *arg); int sernum; void *arg; }; @@ -54,7 +54,7 @@ struct fib6_cleaner { #define FWS_INIT FWS_L #endif -static struct rt6_info *fib6_find_prefix(struct net *net, +static struct fib6_info *fib6_find_prefix(struct net *net, struct fib6_table *table, struct fib6_node *fn); static struct fib6_node *fib6_repair_tree(struct net *net, @@ -105,7 +105,7 @@ enum { FIB6_NO_SERNUM_CHANGE = 0, }; -void fib6_update_sernum(struct net *net, struct rt6_info *rt) +void fib6_update_sernum(struct net *net, struct fib6_info *rt) { struct fib6_node *fn; @@ -145,9 +145,9 @@ static __be32 addr_bit_set(const void *token, int fn_bit) addr[fn_bit >> 5]; } -struct rt6_info *fib6_info_alloc(gfp_t gfp_flags) +struct fib6_info *fib6_info_alloc(gfp_t gfp_flags) { - struct rt6_info *f6i; + struct fib6_info *f6i; f6i = kzalloc(sizeof(*f6i), gfp_flags); if (!f6i) @@ -167,7 +167,7 @@ struct rt6_info *fib6_info_alloc(gfp_t gfp_flags) return f6i; } -void fib6_info_destroy(struct rt6_info *f6i) +void fib6_info_destroy(struct fib6_info *f6i) { struct rt6_exception_bucket *bucket; struct dst_metrics *m; @@ -404,7 +404,7 @@ unsigned int fib6_tables_seq_read(struct net *net) static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net, enum fib_event_type event_type, - struct rt6_info *rt) + struct fib6_info *rt) { struct fib6_entry_notifier_info info = { .rt = rt, @@ -415,7 +415,7 @@ static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net, static int call_fib6_entry_notifiers(struct net *net, 
enum fib_event_type event_type, - struct rt6_info *rt, + struct fib6_info *rt, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { @@ -432,7 +432,7 @@ struct fib6_dump_arg { struct notifier_block *nb; }; -static void fib6_rt_dump(struct rt6_info *rt, struct fib6_dump_arg *arg) +static void fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg) { if (rt == arg->net->ipv6.fib6_null_entry) return; @@ -441,7 +441,7 @@ static void fib6_rt_dump(struct rt6_info *rt, struct fib6_dump_arg *arg) static int fib6_node_dump(struct fib6_walker *w) { - struct rt6_info *rt; + struct fib6_info *rt; for_each_fib6_walker_rt(w) fib6_rt_dump(rt, w->args); @@ -490,7 +490,7 @@ int fib6_tables_dump(struct net *net, struct notifier_block *nb) static int fib6_dump_node(struct fib6_walker *w) { int res; - struct rt6_info *rt; + struct fib6_info *rt; for_each_fib6_walker_rt(w) { res = rt6_dump_route(rt, w->args); @@ -507,7 +507,7 @@ static int fib6_dump_node(struct fib6_walker *w) */ if (rt->rt6i_nsiblings) rt = list_last_entry(&rt->rt6i_siblings, - struct rt6_info, + struct fib6_info, rt6i_siblings); } w->leaf = NULL; @@ -643,7 +643,7 @@ out: return res; } -void fib6_metric_set(struct rt6_info *f6i, int metric, u32 val) +void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val) { if (!f6i) return; @@ -690,7 +690,7 @@ static struct fib6_node *fib6_add_1(struct net *net, fn = root; do { - struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, + struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); key = (struct rt6key *)((u8 *)leaf + offset); @@ -884,7 +884,7 @@ insert_above: return ln; } -static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, +static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, struct net *net) { struct fib6_table *table = rt->rt6i_table; @@ -897,9 +897,9 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, * to still alive ones. */ while (fn) { - struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, + struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); - struct rt6_info *new_leaf; + struct fib6_info *new_leaf; if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) { new_leaf = fib6_find_prefix(net, table, fn); atomic_inc(&new_leaf->rt6i_ref); @@ -936,15 +936,15 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, * Insert routing information in a node. */ -static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, +static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { - struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, + struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&rt->rt6i_table->tb6_lock)); - struct rt6_info *iter = NULL; - struct rt6_info __rcu **ins; - struct rt6_info __rcu **fallback_ins = NULL; + struct fib6_info *iter = NULL; + struct fib6_info __rcu **ins; + struct fib6_info __rcu **fallback_ins = NULL; int replace = (info->nlh && (info->nlh->nlmsg_flags & NLM_F_REPLACE)); int add = (!info->nlh || @@ -1035,7 +1035,7 @@ next_iter: /* Link this route to others same route. 
*/ if (rt->rt6i_nsiblings) { unsigned int rt6i_nsiblings; - struct rt6_info *sibling, *temp_sibling; + struct fib6_info *sibling, *temp_sibling; /* Find the first route that have the same metric */ sibling = leaf; @@ -1156,7 +1156,7 @@ add: return 0; } -static void fib6_start_gc(struct net *net, struct rt6_info *rt) +static void fib6_start_gc(struct net *net, struct fib6_info *rt) { if (!timer_pending(&net->ipv6.ip6_fib_timer) && (rt->rt6i_flags & RTF_EXPIRES)) @@ -1171,7 +1171,7 @@ void fib6_force_start_gc(struct net *net) jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } -static void __fib6_update_sernum_upto_root(struct rt6_info *rt, +static void __fib6_update_sernum_upto_root(struct fib6_info *rt, int sernum) { struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node, @@ -1186,7 +1186,7 @@ static void __fib6_update_sernum_upto_root(struct rt6_info *rt, } } -void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt) +void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt) { __fib6_update_sernum_upto_root(rt, fib6_new_sernum(net)); } @@ -1198,7 +1198,7 @@ void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt) * Need to own table->tb6_lock */ -int fib6_add(struct fib6_node *root, struct rt6_info *rt, +int fib6_add(struct fib6_node *root, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { struct fib6_table *table = rt->rt6i_table; @@ -1219,7 +1219,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, fn = fib6_add_1(info->nl_net, table, root, &rt->rt6i_dst.addr, rt->rt6i_dst.plen, - offsetof(struct rt6_info, rt6i_dst), allow_create, + offsetof(struct fib6_info, rt6i_dst), allow_create, replace_required, extack); if (IS_ERR(fn)) { err = PTR_ERR(fn); @@ -1260,7 +1260,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, sn = fib6_add_1(info->nl_net, table, sfn, &rt->rt6i_src.addr, rt->rt6i_src.plen, - offsetof(struct rt6_info, rt6i_src), + offsetof(struct fib6_info, rt6i_src), allow_create, replace_required, extack); if (IS_ERR(sn)) { @@ -1279,7 +1279,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, } else { sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn), &rt->rt6i_src.addr, rt->rt6i_src.plen, - offsetof(struct rt6_info, rt6i_src), + offsetof(struct fib6_info, rt6i_src), allow_create, replace_required, extack); if (IS_ERR(sn)) { @@ -1316,7 +1316,7 @@ out: * super-tree leaf node we have to find a new one for it. 
*/ if (pn != fn) { - struct rt6_info *pn_leaf = + struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf, lockdep_is_held(&table->tb6_lock)); if (pn_leaf == rt) { @@ -1365,7 +1365,7 @@ failure: */ struct lookup_args { - int offset; /* key offset on rt6_info */ + int offset; /* key offset on fib6_info */ const struct in6_addr *addr; /* search key */ }; @@ -1403,7 +1403,7 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root, struct fib6_node *subtree = FIB6_SUBTREE(fn); if (subtree || fn->fn_flags & RTN_RTINFO) { - struct rt6_info *leaf = rcu_dereference(fn->leaf); + struct fib6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; if (!leaf) @@ -1443,12 +1443,12 @@ struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *dad struct fib6_node *fn; struct lookup_args args[] = { { - .offset = offsetof(struct rt6_info, rt6i_dst), + .offset = offsetof(struct fib6_info, rt6i_dst), .addr = daddr, }, #ifdef CONFIG_IPV6_SUBTREES { - .offset = offsetof(struct rt6_info, rt6i_src), + .offset = offsetof(struct fib6_info, rt6i_src), .addr = saddr, }, #endif @@ -1484,7 +1484,7 @@ static struct fib6_node *fib6_locate_1(struct fib6_node *root, struct fib6_node *fn, *prev = NULL; for (fn = root; fn ; ) { - struct rt6_info *leaf = rcu_dereference(fn->leaf); + struct fib6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; /* This node is being deleted */ @@ -1533,7 +1533,7 @@ struct fib6_node *fib6_locate(struct fib6_node *root, struct fib6_node *fn; fn = fib6_locate_1(root, daddr, dst_len, - offsetof(struct rt6_info, rt6i_dst), + offsetof(struct fib6_info, rt6i_dst), exact_match); #ifdef CONFIG_IPV6_SUBTREES @@ -1544,7 +1544,7 @@ struct fib6_node *fib6_locate(struct fib6_node *root, if (subtree) { fn = fib6_locate_1(subtree, saddr, src_len, - offsetof(struct rt6_info, rt6i_src), + offsetof(struct fib6_info, rt6i_src), exact_match); } } @@ -1563,7 +1563,7 @@ struct fib6_node *fib6_locate(struct fib6_node *root, * */ -static struct rt6_info *fib6_find_prefix(struct net *net, +static struct fib6_info *fib6_find_prefix(struct net *net, struct fib6_table *table, struct fib6_node *fn) { @@ -1622,11 +1622,11 @@ static struct fib6_node *fib6_repair_tree(struct net *net, lockdep_is_held(&table->tb6_lock)); struct fib6_node *pn_l = rcu_dereference_protected(pn->left, lockdep_is_held(&table->tb6_lock)); - struct rt6_info *fn_leaf = rcu_dereference_protected(fn->leaf, + struct fib6_info *fn_leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); - struct rt6_info *pn_leaf = rcu_dereference_protected(pn->leaf, + struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf, lockdep_is_held(&table->tb6_lock)); - struct rt6_info *new_fn_leaf; + struct fib6_info *new_fn_leaf; RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); iter++; @@ -1717,10 +1717,10 @@ static struct fib6_node *fib6_repair_tree(struct net *net, } static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, - struct rt6_info __rcu **rtp, struct nl_info *info) + struct fib6_info __rcu **rtp, struct nl_info *info) { struct fib6_walker *w; - struct rt6_info *rt = rcu_dereference_protected(*rtp, + struct fib6_info *rt = rcu_dereference_protected(*rtp, lockdep_is_held(&table->tb6_lock)); struct net *net = info->nl_net; @@ -1741,7 +1741,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, /* Remove this entry from other siblings */ if (rt->rt6i_nsiblings) { - struct rt6_info *sibling, *next_sibling; + struct fib6_info *sibling, 
*next_sibling; list_for_each_entry_safe(sibling, next_sibling, &rt->rt6i_siblings, rt6i_siblings) @@ -1785,14 +1785,14 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, } /* Need to own table->tb6_lock */ -int fib6_del(struct rt6_info *rt, struct nl_info *info) +int fib6_del(struct fib6_info *rt, struct nl_info *info) { struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node, lockdep_is_held(&rt->rt6i_table->tb6_lock)); struct fib6_table *table = rt->rt6i_table; struct net *net = info->nl_net; - struct rt6_info __rcu **rtp; - struct rt6_info __rcu **rtp_next; + struct fib6_info __rcu **rtp; + struct fib6_info __rcu **rtp_next; if (!fn || rt == net->ipv6.fib6_null_entry) return -ENOENT; @@ -1804,7 +1804,7 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info) */ for (rtp = &fn->leaf; *rtp; rtp = rtp_next) { - struct rt6_info *cur = rcu_dereference_protected(*rtp, + struct fib6_info *cur = rcu_dereference_protected(*rtp, lockdep_is_held(&table->tb6_lock)); if (rt == cur) { fib6_del_route(table, fn, rtp, info); @@ -1948,7 +1948,7 @@ static int fib6_walk(struct net *net, struct fib6_walker *w) static int fib6_clean_node(struct fib6_walker *w) { int res; - struct rt6_info *rt; + struct fib6_info *rt; struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w); struct nl_info info = { .nl_net = c->net, @@ -1983,7 +1983,7 @@ static int fib6_clean_node(struct fib6_walker *w) if (WARN_ON(!rt->rt6i_nsiblings)) continue; rt = list_last_entry(&rt->rt6i_siblings, - struct rt6_info, rt6i_siblings); + struct fib6_info, rt6i_siblings); continue; } WARN_ON(res != 0); @@ -2002,7 +2002,7 @@ static int fib6_clean_node(struct fib6_walker *w) */ static void fib6_clean_tree(struct net *net, struct fib6_node *root, - int (*func)(struct rt6_info *, void *arg), + int (*func)(struct fib6_info *, void *arg), int sernum, void *arg) { struct fib6_cleaner c; @@ -2020,7 +2020,7 @@ static void fib6_clean_tree(struct net *net, struct fib6_node *root, } static void __fib6_clean_all(struct net *net, - int (*func)(struct rt6_info *, void *), + int (*func)(struct fib6_info *, void *), int sernum, void *arg) { struct fib6_table *table; @@ -2040,7 +2040,7 @@ static void __fib6_clean_all(struct net *net, rcu_read_unlock(); } -void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *), +void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *), void *arg) { __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg); @@ -2057,7 +2057,7 @@ static void fib6_flush_trees(struct net *net) * Garbage collection */ -static int fib6_age(struct rt6_info *rt, void *arg) +static int fib6_age(struct fib6_info *rt, void *arg) { struct fib6_gc_args *gc_args = arg; unsigned long now = jiffies; @@ -2261,7 +2261,7 @@ struct ipv6_route_iter { static int ipv6_route_seq_show(struct seq_file *seq, void *v) { - struct rt6_info *rt = v; + struct fib6_info *rt = v; struct ipv6_route_iter *iter = seq->private; const struct net_device *dev; @@ -2353,14 +2353,14 @@ static void ipv6_route_check_sernum(struct ipv6_route_iter *iter) static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) { int r; - struct rt6_info *n; + struct fib6_info *n; struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; if (!v) goto iter_table; - n = rcu_dereference_bh(((struct rt6_info *)v)->rt6_next); + n = rcu_dereference_bh(((struct fib6_info *)v)->rt6_next); if (n) { ++*pos; return n; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 
a28857088bff..102645298692 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1155,7 +1155,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) struct ra_msg *ra_msg = (struct ra_msg *)skb_transport_header(skb); struct neighbour *neigh = NULL; struct inet6_dev *in6_dev; - struct rt6_info *rt = NULL; + struct fib6_info *rt = NULL; struct net *net; int lifetime; struct ndisc_options ndopts; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0d861bd07673..2ccf939e1a20 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -96,24 +96,24 @@ static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu); static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); -static int rt6_score_route(struct rt6_info *rt, int oif, int strict); -static size_t rt6_nlmsg_size(struct rt6_info *rt); +static int rt6_score_route(struct fib6_info *rt, int oif, int strict); +static size_t rt6_nlmsg_size(struct fib6_info *rt); static int rt6_fill_node(struct net *net, struct sk_buff *skb, - struct rt6_info *rt, struct dst_entry *dst, + struct fib6_info *rt, struct dst_entry *dst, struct in6_addr *dest, struct in6_addr *src, int iif, int type, u32 portid, u32 seq, unsigned int flags); -static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt, +static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt, struct in6_addr *daddr, struct in6_addr *saddr); #ifdef CONFIG_IPV6_ROUTE_INFO -static struct rt6_info *rt6_add_route_info(struct net *net, +static struct fib6_info *rt6_add_route_info(struct net *net, const struct in6_addr *prefix, int prefixlen, const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref); -static struct rt6_info *rt6_get_route_info(struct net *net, +static struct fib6_info *rt6_get_route_info(struct net *net, const struct in6_addr *prefix, int prefixlen, const struct in6_addr *gwaddr, struct net_device *dev); @@ -283,7 +283,7 @@ static const u32 ip6_template_metrics[RTAX_MAX] = { [RTAX_HOPLIMIT - 1] = 0, }; -static const struct rt6_info fib6_null_entry_template = { +static const struct fib6_info fib6_null_entry_template = { .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), .rt6i_protocol = RTPROT_KERNEL, .rt6i_metric = ~(u32)0, @@ -372,7 +372,7 @@ EXPORT_SYMBOL(ip6_dst_alloc); static void ip6_dst_destroy(struct dst_entry *dst) { struct rt6_info *rt = (struct rt6_info *)dst; - struct rt6_info *from = rt->from; + struct fib6_info *from = rt->from; struct inet6_dev *idev; dst_destroy_metrics_generic(dst); @@ -425,13 +425,13 @@ static bool rt6_check_expired(const struct rt6_info *rt) return false; } -static struct rt6_info *rt6_multipath_select(const struct net *net, - struct rt6_info *match, +static struct fib6_info *rt6_multipath_select(const struct net *net, + struct fib6_info *match, struct flowi6 *fl6, int oif, const struct sk_buff *skb, int strict) { - struct rt6_info *sibling, *next_sibling; + struct fib6_info *sibling, *next_sibling; /* We might have already computed the hash for ICMPv6 errors. In such * case it will always be non-zero. Otherwise now is the time to do it. @@ -462,14 +462,14 @@ static struct rt6_info *rt6_multipath_select(const struct net *net, * Route lookup. rcu_read_lock() should be held. 
*/ -static inline struct rt6_info *rt6_device_match(struct net *net, - struct rt6_info *rt, +static inline struct fib6_info *rt6_device_match(struct net *net, + struct fib6_info *rt, const struct in6_addr *saddr, int oif, int flags) { - struct rt6_info *local = NULL; - struct rt6_info *sprt; + struct fib6_info *local = NULL; + struct fib6_info *sprt; if (!oif && ipv6_addr_any(saddr) && !(rt->fib6_nh.nh_flags & RTNH_F_DEAD)) @@ -532,7 +532,7 @@ static void rt6_probe_deferred(struct work_struct *w) kfree(work); } -static void rt6_probe(struct rt6_info *rt) +static void rt6_probe(struct fib6_info *rt) { struct __rt6_probe_work *work; const struct in6_addr *nh_gw; @@ -585,7 +585,7 @@ out: rcu_read_unlock_bh(); } #else -static inline void rt6_probe(struct rt6_info *rt) +static inline void rt6_probe(struct fib6_info *rt) { } #endif @@ -593,7 +593,7 @@ static inline void rt6_probe(struct rt6_info *rt) /* * Default Router Selection (RFC 2461 6.3.6) */ -static inline int rt6_check_dev(struct rt6_info *rt, int oif) +static inline int rt6_check_dev(struct fib6_info *rt, int oif) { const struct net_device *dev = rt->fib6_nh.nh_dev; @@ -605,7 +605,7 @@ static inline int rt6_check_dev(struct rt6_info *rt, int oif) return 0; } -static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt) +static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt) { enum rt6_nud_state ret = RT6_NUD_FAIL_HARD; struct neighbour *neigh; @@ -637,8 +637,7 @@ static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt) return ret; } -static int rt6_score_route(struct rt6_info *rt, int oif, - int strict) +static int rt6_score_route(struct fib6_info *rt, int oif, int strict) { int m; @@ -656,8 +655,8 @@ static int rt6_score_route(struct rt6_info *rt, int oif, return m; } -static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict, - int *mpri, struct rt6_info *match, +static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict, + int *mpri, struct fib6_info *match, bool *do_rr) { int m; @@ -696,13 +695,13 @@ out: return match; } -static struct rt6_info *find_rr_leaf(struct fib6_node *fn, - struct rt6_info *leaf, - struct rt6_info *rr_head, +static struct fib6_info *find_rr_leaf(struct fib6_node *fn, + struct fib6_info *leaf, + struct fib6_info *rr_head, u32 metric, int oif, int strict, bool *do_rr) { - struct rt6_info *rt, *match, *cont; + struct fib6_info *rt, *match, *cont; int mpri = -1; match = NULL; @@ -735,11 +734,11 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, return match; } -static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, +static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn, int oif, int strict) { - struct rt6_info *leaf = rcu_dereference(fn->leaf); - struct rt6_info *match, *rt0; + struct fib6_info *leaf = rcu_dereference(fn->leaf); + struct fib6_info *match, *rt0; bool do_rr = false; int key_plen; @@ -767,7 +766,7 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, &do_rr); if (do_rr) { - struct rt6_info *next = rcu_dereference(rt0->rt6_next); + struct fib6_info *next = rcu_dereference(rt0->rt6_next); /* no entries matched; do round-robin */ if (!next || next->rt6i_metric != rt0->rt6i_metric) @@ -785,7 +784,7 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, return match ? 
match : net->ipv6.fib6_null_entry; } -static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt) +static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt) { return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY)); } @@ -799,7 +798,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, struct in6_addr prefix_buf, *prefix; unsigned int pref; unsigned long lifetime; - struct rt6_info *rt; + struct fib6_info *rt; if (len < sizeof(struct route_info)) { return -EINVAL; @@ -871,7 +870,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, */ /* called with rcu_lock held */ -static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt) +static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt) { struct net_device *dev = rt->fib6_nh.nh_dev; @@ -913,7 +912,7 @@ static int ip6_rt_type_to_error(u8 fib6_type) return fib6_prop[fib6_type]; } -static unsigned short fib6_info_dst_flags(struct rt6_info *rt) +static unsigned short fib6_info_dst_flags(struct fib6_info *rt) { unsigned short flags = 0; @@ -927,7 +926,7 @@ static unsigned short fib6_info_dst_flags(struct rt6_info *rt) return flags; } -static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct rt6_info *ort) +static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort) { rt->dst.error = ip6_rt_type_to_error(ort->fib6_type); @@ -949,7 +948,7 @@ static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct rt6_info *ort) } } -static void ip6_rt_init_dst(struct rt6_info *rt, struct rt6_info *ort) +static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort) { rt->dst.flags |= fib6_info_dst_flags(ort); @@ -977,7 +976,7 @@ static void ip6_rt_init_dst(struct rt6_info *rt, struct rt6_info *ort) rt->dst.lastuse = jiffies; } -static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from) +static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from) { rt->rt6i_flags &= ~RTF_EXPIRES; fib6_info_hold(from); @@ -989,7 +988,7 @@ static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from) } } -static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort) +static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort) { ip6_rt_init_dst(rt, ort); @@ -1045,7 +1044,7 @@ static bool ip6_hold_safe(struct net *net, struct rt6_info **prt, } /* called with rcu_lock held */ -static struct rt6_info *ip6_create_rt_rcu(struct rt6_info *rt) +static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt) { unsigned short flags = fib6_info_dst_flags(rt); struct net_device *dev = rt->fib6_nh.nh_dev; @@ -1064,7 +1063,7 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net, const struct sk_buff *skb, int flags) { - struct rt6_info *f6i; + struct fib6_info *f6i; struct fib6_node *fn; struct rt6_info *rt; @@ -1152,7 +1151,7 @@ EXPORT_SYMBOL(rt6_lookup); * Caller must hold dst before calling it. 
*/ -static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info, +static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { int err; @@ -1166,14 +1165,14 @@ static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info, return err; } -int ip6_ins_rt(struct net *net, struct rt6_info *rt) +int ip6_ins_rt(struct net *net, struct fib6_info *rt) { struct nl_info info = { .nl_net = net, }; return __ip6_ins_rt(rt, &info, NULL); } -static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, +static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort, const struct in6_addr *daddr, const struct in6_addr *saddr) { @@ -1213,7 +1212,7 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, return rt; } -static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt) +static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt) { unsigned short flags = fib6_info_dst_flags(rt); struct net_device *dev; @@ -1232,7 +1231,7 @@ static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt) } /* It should be called with rcu_read_lock() acquired */ -static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt) +static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt) { struct rt6_info *pcpu_rt, **p; @@ -1246,7 +1245,7 @@ static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt) } static struct rt6_info *rt6_make_pcpu_route(struct net *net, - struct rt6_info *rt) + struct fib6_info *rt) { struct rt6_info *pcpu_rt, *prev, **p; @@ -1390,7 +1389,7 @@ __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket, return NULL; } -static unsigned int fib6_mtu(const struct rt6_info *rt) +static unsigned int fib6_mtu(const struct fib6_info *rt) { unsigned int mtu; @@ -1401,7 +1400,7 @@ static unsigned int fib6_mtu(const struct rt6_info *rt) } static int rt6_insert_exception(struct rt6_info *nrt, - struct rt6_info *ort) + struct fib6_info *ort) { struct net *net = dev_net(nrt->dst.dev); struct rt6_exception_bucket *bucket; @@ -1487,7 +1486,7 @@ out: return err; } -void rt6_flush_exceptions(struct rt6_info *rt) +void rt6_flush_exceptions(struct fib6_info *rt) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; @@ -1517,7 +1516,7 @@ out: /* Find cached rt in the hash table inside passed in rt * Caller has to hold rcu_read_lock() */ -static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt, +static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt, struct in6_addr *daddr, struct in6_addr *saddr) { @@ -1550,7 +1549,7 @@ static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt, static int rt6_remove_exception_rt(struct rt6_info *rt) { struct rt6_exception_bucket *bucket; - struct rt6_info *from = rt->from; + struct fib6_info *from = rt->from; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; int err; @@ -1595,7 +1594,7 @@ static int rt6_remove_exception_rt(struct rt6_info *rt) static void rt6_update_exception_stamp_rt(struct rt6_info *rt) { struct rt6_exception_bucket *bucket; - struct rt6_info *from = rt->from; + struct fib6_info *from = rt->from; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; @@ -1625,7 +1624,7 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt) rcu_read_unlock(); } -static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt) +static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; @@ -1667,7 +1666,7 @@ static bool rt6_mtu_change_route_allowed(struct 
inet6_dev *idev, } static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, - struct rt6_info *rt, int mtu) + struct fib6_info *rt, int mtu) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; @@ -1697,7 +1696,7 @@ static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) -static void rt6_exceptions_clean_tohost(struct rt6_info *rt, +static void rt6_exceptions_clean_tohost(struct fib6_info *rt, struct in6_addr *gateway) { struct rt6_exception_bucket *bucket; @@ -1776,7 +1775,7 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, gc_args->more++; } -void rt6_age_exceptions(struct rt6_info *rt, +void rt6_age_exceptions(struct fib6_info *rt, struct fib6_gc_args *gc_args, unsigned long now) { @@ -1812,7 +1811,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, const struct sk_buff *skb, int flags) { struct fib6_node *fn, *saved_fn; - struct rt6_info *f6i; + struct fib6_info *f6i; struct rt6_info *rt; int strict = 0; @@ -2139,7 +2138,7 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori * Destination cache support functions */ -static bool fib6_check(struct rt6_info *f6i, u32 cookie) +static bool fib6_check(struct fib6_info *f6i, u32 cookie) { u32 rt_cookie = 0; @@ -2374,7 +2373,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, { struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; struct rt6_info *ret = NULL, *rt_cache; - struct rt6_info *rt; + struct fib6_info *rt; struct fib6_node *fn; /* Get the "current" route for this destination and @@ -2620,7 +2619,7 @@ out: return entries > rt_max_size; } -static int ip6_convert_metrics(struct net *net, struct rt6_info *rt, +static int ip6_convert_metrics(struct net *net, struct fib6_info *rt, struct fib6_config *cfg) { int err = 0; @@ -2823,12 +2822,12 @@ out: return err; } -static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, +static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack) { struct net *net = cfg->fc_nlinfo.nl_net; - struct rt6_info *rt = NULL; + struct fib6_info *rt = NULL; struct net_device *dev = NULL; struct inet6_dev *idev = NULL; struct fib6_table *table; @@ -3047,7 +3046,7 @@ out: int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack) { - struct rt6_info *rt; + struct fib6_info *rt; int err; rt = ip6_route_info_create(cfg, gfp_flags, extack); @@ -3060,7 +3059,7 @@ int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, return err; } -static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) +static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) { struct net *net = info->nl_net; struct fib6_table *table; @@ -3081,14 +3080,14 @@ out: return err; } -int ip6_del_rt(struct net *net, struct rt6_info *rt) +int ip6_del_rt(struct net *net, struct fib6_info *rt) { struct nl_info info = { .nl_net = net }; return __ip6_del_rt(rt, &info); } -static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) +static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) { struct nl_info *info = &cfg->fc_nlinfo; struct net *net = info->nl_net; @@ -3102,7 +3101,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) spin_lock_bh(&table->tb6_lock); if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) { - struct rt6_info *sibling, *next_sibling; + struct fib6_info *sibling, 
*next_sibling; /* prefer to send a single notification with all hops */ skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); @@ -3159,8 +3158,9 @@ out: static int ip6_route_del(struct fib6_config *cfg, struct netlink_ext_ack *extack) { - struct rt6_info *rt, *rt_cache; + struct rt6_info *rt_cache; struct fib6_table *table; + struct fib6_info *rt; struct fib6_node *fn; int err = -ESRCH; @@ -3336,7 +3336,7 @@ out: } #ifdef CONFIG_IPV6_ROUTE_INFO -static struct rt6_info *rt6_get_route_info(struct net *net, +static struct fib6_info *rt6_get_route_info(struct net *net, const struct in6_addr *prefix, int prefixlen, const struct in6_addr *gwaddr, struct net_device *dev) @@ -3344,7 +3344,7 @@ static struct rt6_info *rt6_get_route_info(struct net *net, u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; int ifindex = dev->ifindex; struct fib6_node *fn; - struct rt6_info *rt = NULL; + struct fib6_info *rt = NULL; struct fib6_table *table; table = fib6_get_table(net, tb_id); @@ -3363,7 +3363,7 @@ static struct rt6_info *rt6_get_route_info(struct net *net, continue; if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr)) continue; - ip6_hold_safe(NULL, &rt, false); + fib6_info_hold(rt); break; } out: @@ -3371,7 +3371,7 @@ out: return rt; } -static struct rt6_info *rt6_add_route_info(struct net *net, +static struct fib6_info *rt6_add_route_info(struct net *net, const struct in6_addr *prefix, int prefixlen, const struct in6_addr *gwaddr, struct net_device *dev, @@ -3404,12 +3404,12 @@ static struct rt6_info *rt6_add_route_info(struct net *net, } #endif -struct rt6_info *rt6_get_dflt_router(struct net *net, +struct fib6_info *rt6_get_dflt_router(struct net *net, const struct in6_addr *addr, struct net_device *dev) { u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT; - struct rt6_info *rt; + struct fib6_info *rt; struct fib6_table *table; table = fib6_get_table(net, tb_id); @@ -3424,12 +3424,12 @@ struct rt6_info *rt6_get_dflt_router(struct net *net, break; } if (rt) - ip6_hold_safe(NULL, &rt, false); + fib6_info_hold(rt); rcu_read_unlock(); return rt; } -struct rt6_info *rt6_add_dflt_router(struct net *net, +struct fib6_info *rt6_add_dflt_router(struct net *net, const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref) @@ -3463,7 +3463,7 @@ struct rt6_info *rt6_add_dflt_router(struct net *net, static void __rt6_purge_dflt_routers(struct net *net, struct fib6_table *table) { - struct rt6_info *rt; + struct fib6_info *rt; restart: rcu_read_lock(); @@ -3614,14 +3614,14 @@ static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff * Allocate a dst for local (unicast / anycast) address. */ -struct rt6_info *addrconf_dst_alloc(struct net *net, +struct fib6_info *addrconf_dst_alloc(struct net *net, struct inet6_dev *idev, const struct in6_addr *addr, bool anycast, gfp_t gfp_flags) { u32 tb_id; struct net_device *dev = idev->dev; - struct rt6_info *rt; + struct fib6_info *rt; rt = fib6_info_alloc(gfp_flags); if (!rt) @@ -3661,7 +3661,7 @@ struct arg_dev_net_ip { struct in6_addr *addr; }; -static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg) +static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) { struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; struct net *net = ((struct arg_dev_net_ip *)arg)->net; @@ -3694,7 +3694,7 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY) /* Remove routers and update dst entries when gateway turn into host. 
*/ -static int fib6_clean_tohost(struct rt6_info *rt, void *arg) +static int fib6_clean_tohost(struct fib6_info *rt, void *arg) { struct in6_addr *gateway = (struct in6_addr *)arg; @@ -3725,9 +3725,9 @@ struct arg_netdev_event { }; }; -static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt) +static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) { - struct rt6_info *iter; + struct fib6_info *iter; struct fib6_node *fn; fn = rcu_dereference_protected(rt->rt6i_node, @@ -3745,7 +3745,7 @@ static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt) return NULL; } -static bool rt6_is_dead(const struct rt6_info *rt) +static bool rt6_is_dead(const struct fib6_info *rt) { if (rt->fib6_nh.nh_flags & RTNH_F_DEAD || (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && @@ -3755,9 +3755,9 @@ static bool rt6_is_dead(const struct rt6_info *rt) return false; } -static int rt6_multipath_total_weight(const struct rt6_info *rt) +static int rt6_multipath_total_weight(const struct fib6_info *rt) { - struct rt6_info *iter; + struct fib6_info *iter; int total = 0; if (!rt6_is_dead(rt)) @@ -3771,7 +3771,7 @@ static int rt6_multipath_total_weight(const struct rt6_info *rt) return total; } -static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total) +static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) { int upper_bound = -1; @@ -3783,9 +3783,9 @@ static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total) atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound); } -static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total) +static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) { - struct rt6_info *iter; + struct fib6_info *iter; int weight = 0; rt6_upper_bound_set(rt, &weight, total); @@ -3794,9 +3794,9 @@ static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total) rt6_upper_bound_set(iter, &weight, total); } -void rt6_multipath_rebalance(struct rt6_info *rt) +void rt6_multipath_rebalance(struct fib6_info *rt) { - struct rt6_info *first; + struct fib6_info *first; int total; /* In case the entire multipath route was marked for flushing, @@ -3818,7 +3818,7 @@ void rt6_multipath_rebalance(struct rt6_info *rt) rt6_multipath_upper_bound_set(first, total); } -static int fib6_ifup(struct rt6_info *rt, void *p_arg) +static int fib6_ifup(struct fib6_info *rt, void *p_arg) { const struct arg_netdev_event *arg = p_arg; struct net *net = dev_net(arg->dev); @@ -3847,10 +3847,10 @@ void rt6_sync_up(struct net_device *dev, unsigned int nh_flags) fib6_clean_all(dev_net(dev), fib6_ifup, &arg); } -static bool rt6_multipath_uses_dev(const struct rt6_info *rt, +static bool rt6_multipath_uses_dev(const struct fib6_info *rt, const struct net_device *dev) { - struct rt6_info *iter; + struct fib6_info *iter; if (rt->fib6_nh.nh_dev == dev) return true; @@ -3861,19 +3861,19 @@ static bool rt6_multipath_uses_dev(const struct rt6_info *rt, return false; } -static void rt6_multipath_flush(struct rt6_info *rt) +static void rt6_multipath_flush(struct fib6_info *rt) { - struct rt6_info *iter; + struct fib6_info *iter; rt->should_flush = 1; list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) iter->should_flush = 1; } -static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt, +static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, const struct net_device *down_dev) { - struct rt6_info *iter; + struct fib6_info *iter; unsigned int dead = 0; if 
(rt->fib6_nh.nh_dev == down_dev || @@ -3887,11 +3887,11 @@ static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt, return dead; } -static void rt6_multipath_nh_flags_set(struct rt6_info *rt, +static void rt6_multipath_nh_flags_set(struct fib6_info *rt, const struct net_device *dev, unsigned int nh_flags) { - struct rt6_info *iter; + struct fib6_info *iter; if (rt->fib6_nh.nh_dev == dev) rt->fib6_nh.nh_flags |= nh_flags; @@ -3901,7 +3901,7 @@ static void rt6_multipath_nh_flags_set(struct rt6_info *rt, } /* called with write lock held for table with rt */ -static int fib6_ifdown(struct rt6_info *rt, void *p_arg) +static int fib6_ifdown(struct fib6_info *rt, void *p_arg) { const struct arg_netdev_event *arg = p_arg; const struct net_device *dev = arg->dev; @@ -3968,7 +3968,7 @@ struct rt6_mtu_change_arg { unsigned int mtu; }; -static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) +static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg) { struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; struct inet6_dev *idev; @@ -4155,7 +4155,7 @@ errout: } struct rt6_nh { - struct rt6_info *rt6_info; + struct fib6_info *fib6_info; struct fib6_config r_cfg; struct list_head next; }; @@ -4173,21 +4173,22 @@ static void ip6_print_replace_route_err(struct list_head *rt6_nh_list) static int ip6_route_info_append(struct net *net, struct list_head *rt6_nh_list, - struct rt6_info *rt, struct fib6_config *r_cfg) + struct fib6_info *rt, + struct fib6_config *r_cfg) { struct rt6_nh *nh; int err = -EEXIST; list_for_each_entry(nh, rt6_nh_list, next) { - /* check if rt6_info already exists */ - if (rt6_duplicate_nexthop(nh->rt6_info, rt)) + /* check if fib6_info already exists */ + if (rt6_duplicate_nexthop(nh->fib6_info, rt)) return err; } nh = kzalloc(sizeof(*nh), GFP_KERNEL); if (!nh) return -ENOMEM; - nh->rt6_info = rt; + nh->fib6_info = rt; err = ip6_convert_metrics(net, rt, r_cfg); if (err) { kfree(nh); @@ -4199,8 +4200,8 @@ static int ip6_route_info_append(struct net *net, return 0; } -static void ip6_route_mpath_notify(struct rt6_info *rt, - struct rt6_info *rt_last, +static void ip6_route_mpath_notify(struct fib6_info *rt, + struct fib6_info *rt_last, struct nl_info *info, __u16 nlflags) { @@ -4212,7 +4213,7 @@ static void ip6_route_mpath_notify(struct rt6_info *rt, */ if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) { rt = list_first_entry(&rt_last->rt6i_siblings, - struct rt6_info, + struct fib6_info, rt6i_siblings); } @@ -4223,11 +4224,11 @@ static void ip6_route_mpath_notify(struct rt6_info *rt, static int ip6_route_multipath_add(struct fib6_config *cfg, struct netlink_ext_ack *extack) { - struct rt6_info *rt_notif = NULL, *rt_last = NULL; + struct fib6_info *rt_notif = NULL, *rt_last = NULL; struct nl_info *info = &cfg->fc_nlinfo; struct fib6_config r_cfg; struct rtnexthop *rtnh; - struct rt6_info *rt; + struct fib6_info *rt; struct rt6_nh *err_nh; struct rt6_nh *nh, *nh_safe; __u16 nlflags; @@ -4247,7 +4248,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, rtnh = (struct rtnexthop *)cfg->fc_mp; /* Parse a Multipath Entry and build a list (rt6_nh_list) of - * rt6_info structs per nexthop + * fib6_info structs per nexthop */ while (rtnh_ok(rtnh, remaining)) { memcpy(&r_cfg, cfg, sizeof(*cfg)); @@ -4297,16 +4298,16 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, err_nh = NULL; list_for_each_entry(nh, &rt6_nh_list, next) { - rt_last = nh->rt6_info; - err = __ip6_ins_rt(nh->rt6_info, info, extack); - 
fib6_info_release(nh->rt6_info);
+		rt_last = nh->fib6_info;
+		err = __ip6_ins_rt(nh->fib6_info, info, extack);
+		fib6_info_release(nh->fib6_info);

 		/* save reference to first route for notification */
 		if (!rt_notif && !err)
-			rt_notif = nh->rt6_info;
+			rt_notif = nh->fib6_info;

-		/* nh->rt6_info is used or freed at this point, reset to NULL*/
-		nh->rt6_info = NULL;
+		/* nh->fib6_info is used or freed at this point, reset to NULL*/
+		nh->fib6_info = NULL;
 		if (err) {
 			if (replace && nhn)
 				ip6_print_replace_route_err(&rt6_nh_list);
@@ -4347,8 +4348,8 @@ add_errout:
 cleanup:
 	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
-		if (nh->rt6_info)
-			fib6_info_release(nh->rt6_info);
+		if (nh->fib6_info)
+			fib6_info_release(nh->fib6_info);
 		list_del(&nh->next);
 		kfree(nh);
 	}
@@ -4428,7 +4429,7 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
 	return ip6_route_add(&cfg, GFP_KERNEL, extack);
 }

-static size_t rt6_nlmsg_size(struct rt6_info *rt)
+static size_t rt6_nlmsg_size(struct fib6_info *rt)
 {
 	int nexthop_len = 0;

@@ -4458,7 +4459,7 @@ static size_t rt6_nlmsg_size(struct rt6_info *rt)
 	       + nexthop_len;
 }

-static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
+static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
 			    unsigned int *flags, bool skip_oif)
 {
 	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
@@ -4495,7 +4496,7 @@ nla_put_failure:
 }

 /* add multipath next hop */
-static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
+static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
 {
 	const struct net_device *dev = rt->fib6_nh.nh_dev;
 	struct rtnexthop *rtnh;
@@ -4523,7 +4524,7 @@ nla_put_failure:
 }

 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
-			 struct rt6_info *rt, struct dst_entry *dst,
+			 struct fib6_info *rt, struct dst_entry *dst,
 			 struct in6_addr *dest, struct in6_addr *src,
 			 int iif, int type, u32 portid, u32 seq,
 			 unsigned int flags)
@@ -4613,7 +4614,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
 	 * each as a nexthop within RTA_MULTIPATH.
 	 */
 	if (rt->rt6i_nsiblings) {
-		struct rt6_info *sibling, *next_sibling;
+		struct fib6_info *sibling, *next_sibling;
 		struct nlattr *mp;

 		mp = nla_nest_start(skb, RTA_MULTIPATH);
@@ -4655,7 +4656,7 @@ nla_put_failure:
 	return -EMSGSIZE;
 }

-int rt6_dump_route(struct rt6_info *rt, void *p_arg)
+int rt6_dump_route(struct fib6_info *rt, void *p_arg)
 {
 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
 	struct net *net = arg->net;
@@ -4800,7 +4801,7 @@ errout:
 	return err;
 }

-void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
+void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
 		     unsigned int nlm_flags)
 {
 	struct sk_buff *skb;
--
cgit v1.2.3


From 77634cc67dc1ffc8ae6d869af6dee4b2ea6025ee Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Tue, 17 Apr 2018 17:33:27 -0700
Subject: net/ipv6: Remove unused code and variables for rt6_info

Drop unneeded elements from rt6_info struct and rearrange layout to
something more relevant for the data path.

Signed-off-by: David Ahern
Signed-off-by: David S. Miller
---
 include/net/ip6_fib.h   | 60 +++----------------------------------------------
 net/ipv6/ip6_fib.c      | 22 ------------------
 net/ipv6/route.c        | 27 ++--------------------
 net/ipv6/xfrm6_policy.c |  2 --
 4 files changed, 5 insertions(+), 106 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index d41b7bd69fb3..a36116b92100 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -175,58 +175,20 @@ struct fib6_info {

 struct rt6_info {
 	struct dst_entry		dst;
-	struct rt6_info __rcu		*rt6_next;
 	struct fib6_info		*from;

-	/*
-	 * Tail elements of dst_entry (__refcnt etc.)
-	 * and these elements (rarely used in hot path) are in
-	 * the same cache line.
-	 */
-	struct fib6_table		*rt6i_table;
-	struct fib6_node __rcu		*rt6i_node;
-
+	struct rt6key			rt6i_dst;
+	struct rt6key			rt6i_src;
 	struct in6_addr			rt6i_gateway;
-
-	/* Multipath routes:
-	 * siblings is a list of rt6_info that have the the same metric/weight,
-	 * destination, but not the same gateway. nsiblings is just a cache
-	 * to speed up lookup.
-	 */
-	struct list_head		rt6i_siblings;
-	unsigned int			rt6i_nsiblings;
-
-	atomic_t			rt6i_ref;
-
-	/* These are in a separate cache line. */
-	struct rt6key			rt6i_dst ____cacheline_aligned_in_smp;
+	struct inet6_dev		*rt6i_idev;
 	u32				rt6i_flags;
-	struct rt6key			rt6i_src;
 	struct rt6key			rt6i_prefsrc;

 	struct list_head		rt6i_uncached;
 	struct uncached_list		*rt6i_uncached_list;
-	struct inet6_dev		*rt6i_idev;
-	struct rt6_info * __percpu	*rt6i_pcpu;
-	struct rt6_exception_bucket __rcu *rt6i_exception_bucket;
-
-	u32				rt6i_metric;

 	/* more non-fragment space at head required */
 	unsigned short			rt6i_nfheader_len;
-	u8				rt6i_protocol;
-	u8				fib6_type;
-	u8				exception_bucket_flushed:1,
-					should_flush:1,
-					dst_nocount:1,
-					dst_nopolicy:1,
-					dst_host:1,
-					unused:3;
-
-	unsigned long			expires;
-	struct dst_metrics		*fib6_metrics;
-#define fib6_pmtu		fib6_metrics->metrics[RTAX_MTU-1]
-
-	struct fib6_nh			fib6_nh;
 };

 #define for_each_fib6_node_rt_rcu(fn)	\
@@ -328,8 +290,6 @@ static inline void ip6_rt_put(struct rt6_info *rt)
 	dst_release(&rt->dst);
 }

-void rt6_free_pcpu(struct rt6_info *non_pcpu_rt);
-
 struct fib6_info *fib6_info_alloc(gfp_t gfp_flags);
 void fib6_info_destroy(struct fib6_info *f6i);

@@ -344,20 +304,6 @@ static inline void fib6_info_release(struct fib6_info *f6i)
 		fib6_info_destroy(f6i);
 }

-static inline void rt6_hold(struct rt6_info *rt)
-{
-	atomic_inc(&rt->rt6i_ref);
-}
-
-static inline void rt6_release(struct rt6_info *rt)
-{
-	if (atomic_dec_and_test(&rt->rt6i_ref)) {
-		rt6_free_pcpu(rt);
-		dst_dev_put(&rt->dst);
-		dst_release(&rt->dst);
-	}
-}
-
 enum fib6_walk_state {
 #ifdef CONFIG_IPV6_SUBTREES
 	FWS_S,
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 77cf43b2d858..2ab49b7cac22 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -240,28 +240,6 @@ static void node_free(struct net *net, struct fib6_node *fn)
 	net->ipv6.rt6_stats->fib_nodes--;
 }

-void rt6_free_pcpu(struct rt6_info *non_pcpu_rt)
-{
-	int cpu;
-
-	if (!non_pcpu_rt->rt6i_pcpu)
-		return;
-
-	for_each_possible_cpu(cpu) {
-		struct rt6_info **ppcpu_rt;
-		struct rt6_info *pcpu_rt;
-
-		ppcpu_rt = per_cpu_ptr(non_pcpu_rt->rt6i_pcpu, cpu);
-		pcpu_rt = *ppcpu_rt;
-		if (pcpu_rt) {
-			dst_dev_put(&pcpu_rt->dst);
-			dst_release(&pcpu_rt->dst);
-			*ppcpu_rt = NULL;
-		}
-	}
-}
-EXPORT_SYMBOL_GPL(rt6_free_pcpu);
-
 static void fib6_free_table(struct fib6_table *table)
 {
 	inetpeer_invalidate_tree(&table->tb6_peers);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 2ccf939e1a20..f9c363327d62 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -302,10 +302,6 @@ static const struct rt6_info ip6_null_entry_template = {
 		.output		= ip6_pkt_discard_out,
 	},
 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
-	.rt6i_protocol  = RTPROT_KERNEL,
-	.rt6i_metric	= ~(u32) 0,
-	.rt6i_ref	= ATOMIC_INIT(1),
-	.fib6_type	= RTN_UNREACHABLE,
 };

 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
@@ -320,10 +316,6 @@ static const struct rt6_info ip6_prohibit_entry_template = {
 		.output		= ip6_pkt_prohibit_out,
 	},
 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
-	.rt6i_protocol  = RTPROT_KERNEL,
-	.rt6i_metric	= ~(u32) 0,
-	.rt6i_ref	= ATOMIC_INIT(1),
-	.fib6_type	= RTN_PROHIBIT,
 };

 static const struct rt6_info ip6_blk_hole_entry_template = {
@@ -336,10 +328,6 @@ static const struct rt6_info ip6_blk_hole_entry_template = {
 		.output		= dst_discard_out,
 	},
 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
-	.rt6i_protocol  = RTPROT_KERNEL,
-	.rt6i_metric	= ~(u32) 0,
-	.rt6i_ref	= ATOMIC_INIT(1),
-	.fib6_type	= RTN_BLACKHOLE,
 };

 #endif
@@ -349,7 +337,6 @@ static void rt6_info_init(struct rt6_info *rt)
 	struct dst_entry *dst = &rt->dst;

 	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
-	INIT_LIST_HEAD(&rt->rt6i_siblings);
 	INIT_LIST_HEAD(&rt->rt6i_uncached);
 }

@@ -999,12 +986,10 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
 	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
 	rt->rt6i_flags = ort->rt6i_flags;
 	rt6_set_from(rt, ort);
-	rt->rt6i_metric = ort->rt6i_metric;
 #ifdef CONFIG_IPV6_SUBTREES
 	rt->rt6i_src = ort->rt6i_src;
 #endif
 	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
-	rt->rt6i_table = ort->rt6i_table;
 	rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
 }

@@ -1192,7 +1177,6 @@ static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,

 	ip6_rt_copy_init(rt, ort);
 	rt->rt6i_flags |= RTF_CACHE;
-	rt->rt6i_metric = 0;
 	rt->dst.flags |= DST_HOST;
 	rt->rt6i_dst.addr = *daddr;
 	rt->rt6i_dst.plen = 128;
@@ -1225,7 +1209,6 @@ static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
 	if (!pcpu_rt)
 		return NULL;
 	ip6_rt_copy_init(pcpu_rt, rt);
-	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
 	pcpu_rt->rt6i_flags |= RTF_PCPU;
 	return pcpu_rt;
 }
@@ -1279,9 +1262,8 @@ static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
 		return;

 	net = dev_net(rt6_ex->rt6i->dst.dev);
-	rt6_ex->rt6i->rt6i_node = NULL;
 	hlist_del_rcu(&rt6_ex->hlist);
-	ip6_rt_put(rt6_ex->rt6i);
+	dst_release(&rt6_ex->rt6i->dst);
 	kfree_rcu(rt6_ex, rcu);
 	WARN_ON_ONCE(!bucket->depth);
 	bucket->depth--;
@@ -1463,8 +1445,6 @@ static int rt6_insert_exception(struct rt6_info *nrt,
 	}
 	rt6_ex->rt6i = nrt;
 	rt6_ex->stamp = jiffies;
-	atomic_inc(&nrt->rt6i_ref);
-	nrt->rt6i_node = ort->rt6i_node;
 	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
 	bucket->depth++;
 	net->ipv6.rt6_stats->fib_rt_cache++;
@@ -2122,7 +2102,6 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori
 		rt->rt6i_idev = in6_dev_get(loopback_dev);
 		rt->rt6i_gateway = ort->rt6i_gateway;
 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
-		rt->rt6i_metric = 0;

 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
 #ifdef CONFIG_IPV6_SUBTREES
@@ -2247,8 +2226,7 @@ static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
 {
 	return !(rt->rt6i_flags & RTF_CACHE) &&
-		(rt->rt6i_flags & RTF_PCPU ||
-		 rcu_access_pointer(rt->rt6i_node));
+		(rt->rt6i_flags & RTF_PCPU || rt->from);
 }

 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
@@ -3313,7 +3291,6 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
 	if (on_link)
 		nrt->rt6i_flags &= ~RTF_GATEWAY;

-	nrt->rt6i_protocol = RTPROT_REDIRECT;
 	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

 	/* No need to remove rt from the exception table if rt is
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 416fe67271a9..2cff209d0fc1 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -107,8 +107,6 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	 * it was magically lost, so this code needs audit */
 	xdst->u.rt6.rt6i_flags = rt->rt6i_flags & (RTF_ANYCAST | RTF_LOCAL);
-	xdst->u.rt6.rt6i_metric = rt->rt6i_metric;
-	xdst->u.rt6.rt6i_node = rt->rt6i_node;
 	xdst->route_cookie = rt6_get_cookie(rt);
 	xdst->u.rt6.rt6i_gateway = rt->rt6i_gateway;
 	xdst->u.rt6.rt6i_dst = rt->rt6i_dst;
--
cgit v1.2.3


From 24ada03555505205b0c8b8b796d52926600bf947 Mon Sep 17 00:00:00 2001
From: Mark Brown
Date: Wed, 18 Apr 2018 15:40:41 +0100
Subject: ASoC: topology: Fix build errors

The two commits:

81e9b0a07889 ASoC: topology: Give more data to clients via callbacks
28aa6f7779f7 ASoC: topology: Add callback for DAPM route load/unload

break the build so revert them.

Reported-by: Stephen Rothwell
Signed-off-by: Mark Brown
---
 include/sound/soc-topology.h           | 30 +++++++----------------
 sound/soc/intel/skylake/skl-pcm.c      |  7 +++---
 sound/soc/intel/skylake/skl-topology.c |  5 ++--
 sound/soc/intel/skylake/skl-topology.h | 20 ++++++++++++----
 sound/soc/soc-topology.c               | 44 ++++++++++------------------------
 5 files changed, 42 insertions(+), 64 deletions(-)

(limited to 'include')

diff --git a/include/sound/soc-topology.h b/include/sound/soc-topology.h
index 401ef2c45d6c..f552c3f56368 100644
--- a/include/sound/soc-topology.h
+++ b/include/sound/soc-topology.h
@@ -30,9 +30,6 @@ struct snd_soc_dapm_context;
 struct snd_soc_card;
 struct snd_kcontrol_new;
 struct snd_soc_dai_link;
-struct snd_soc_dai_driver;
-struct snd_soc_dai;
-struct snd_soc_dapm_route;

 /* object scan be loaded and unloaded in groups with identfying indexes */
 #define SND_SOC_TPLG_INDEX_ALL	0	/* ID that matches all FW objects */
@@ -112,44 +109,35 @@ struct snd_soc_tplg_widget_events {
 struct snd_soc_tplg_ops {

 	/* external kcontrol init - used for any driver specific init */
-	int (*control_load)(struct snd_soc_component *, int index,
+	int (*control_load)(struct snd_soc_component *,
 		struct snd_kcontrol_new *, struct snd_soc_tplg_ctl_hdr *);
 	int (*control_unload)(struct snd_soc_component *,
 		struct snd_soc_dobj *);

-	/* DAPM graph route element loading and unloading */
-	int (*dapm_route_load)(struct snd_soc_component *, int index,
-		struct snd_soc_dapm_route *route);
-	int (*dapm_route_unload)(struct snd_soc_component *,
-		struct snd_soc_dobj *);
-
 	/* external widget init - used for any driver specific init */
-	int (*widget_load)(struct snd_soc_component *, int index,
+	int (*widget_load)(struct snd_soc_component *,
 		struct snd_soc_dapm_widget *,
 		struct snd_soc_tplg_dapm_widget *);
-	int (*widget_ready)(struct snd_soc_component *, int index,
+	int (*widget_ready)(struct snd_soc_component *,
 		struct snd_soc_dapm_widget *,
 		struct snd_soc_tplg_dapm_widget *);
 	int (*widget_unload)(struct snd_soc_component *,
 		struct snd_soc_dobj *);

 	/* FE DAI - used for any driver specific init */
-	int (*dai_load)(struct snd_soc_component *, int index,
-		struct snd_soc_dai_driver *dai_drv,
-		struct snd_soc_tplg_pcm *pcm, struct snd_soc_dai *dai);
-
+	int (*dai_load)(struct snd_soc_component *,
+		struct snd_soc_dai_driver *dai_drv);
 	int (*dai_unload)(struct snd_soc_component *,
 		struct snd_soc_dobj *);

 	/* DAI link - used for any driver specific init */
-	int (*link_load)(struct snd_soc_component *, int index,
-		struct snd_soc_dai_link *link,
-		struct snd_soc_tplg_link_config *cfg);
+	int (*link_load)(struct snd_soc_component *,
+		struct snd_soc_dai_link *link);
 	int (*link_unload)(struct snd_soc_component *,
 		struct snd_soc_dobj *);

 	/* callback to handle vendor bespoke data */
-	int (*vendor_load)(struct snd_soc_component *, int index,
+	int (*vendor_load)(struct snd_soc_component *,
 		struct snd_soc_tplg_hdr *);
 	int (*vendor_unload)(struct snd_soc_component *,
 		struct snd_soc_tplg_hdr *);
@@ -158,7 +146,7 @@ struct snd_soc_tplg_ops {
 	void (*complete)(struct snd_soc_component *);

 	/* manifest - optional to inform component of manifest */
-	int (*manifest)(struct snd_soc_component *, int index,
+	int (*manifest)(struct snd_soc_component *,
 		struct snd_soc_tplg_manifest *);

 	/* vendor specific kcontrol handlers available for binding */
diff --git a/sound/soc/intel/skylake/skl-pcm.c b/sound/soc/intel/skylake/skl-pcm.c
index 1f4dd08d36c5..afa86b9e4dcf 100644
--- a/sound/soc/intel/skylake/skl-pcm.c
+++ b/sound/soc/intel/skylake/skl-pcm.c
@@ -1017,11 +1017,10 @@ static struct snd_soc_dai_driver skl_platform_dai[] = {
 	},
 };

-int skl_dai_load(struct snd_soc_component *cmp, int index,
-		 struct snd_soc_dai_driver *dai_drv,
-		 struct snd_soc_tplg_pcm *pcm, struct snd_soc_dai *dai)
+int skl_dai_load(struct snd_soc_component *cmp,
+		 struct snd_soc_dai_driver *pcm_dai)
 {
-	dai_drv->ops = &skl_pcm_dai_ops;
+	pcm_dai->ops = &skl_pcm_dai_ops;

 	return 0;
 }
diff --git a/sound/soc/intel/skylake/skl-topology.c b/sound/soc/intel/skylake/skl-topology.c
index 6ac081f1f215..3b1dca419883 100644
--- a/sound/soc/intel/skylake/skl-topology.c
+++ b/sound/soc/intel/skylake/skl-topology.c
@@ -2851,7 +2851,7 @@ void skl_cleanup_resources(struct skl *skl)
 * information to the driver about module and pipeline parameters which DSP
 * FW expects like ids, resource values, formats etc
 */
-static int skl_tplg_widget_load(struct snd_soc_component *cmpnt, int index,
+static int skl_tplg_widget_load(struct snd_soc_component *cmpnt,
 				struct snd_soc_dapm_widget *w,
 				struct snd_soc_tplg_dapm_widget *tplg_w)
 {
@@ -2958,7 +2958,6 @@ static int skl_init_enum_data(struct device *dev, struct soc_enum *se,
 }

 static int skl_tplg_control_load(struct snd_soc_component *cmpnt,
-				int index,
 				struct snd_kcontrol_new *kctl,
 				struct snd_soc_tplg_ctl_hdr *hdr)
 {
@@ -3447,7 +3446,7 @@ static int skl_tplg_get_manifest_data(struct snd_soc_tplg_manifest *manifest,
 	return 0;
 }

-static int skl_manifest_load(struct snd_soc_component *cmpnt, int index,
+static int skl_manifest_load(struct snd_soc_component *cmpnt,
 			struct snd_soc_tplg_manifest *manifest)
 {
 	struct hdac_ext_bus *ebus = snd_soc_component_get_drvdata(cmpnt);
diff --git a/sound/soc/intel/skylake/skl-topology.h b/sound/soc/intel/skylake/skl-topology.h
index 77857c598eed..b1e0667c0ae0 100644
--- a/sound/soc/intel/skylake/skl-topology.h
+++ b/sound/soc/intel/skylake/skl-topology.h
@@ -221,9 +221,18 @@ struct skl_mod_inst_map {
 	u16 inst_id;
 };

+struct skl_uuid_inst_map {
+	u16 inst_id;
+	u16 reserved;
+	uuid_le mod_uuid;
+} __packed;
+
 struct skl_kpb_params {
 	u32 num_modules;
-	struct skl_mod_inst_map map[0];
+	union {
+		struct skl_mod_inst_map map[0];
+		struct skl_uuid_inst_map map_uuid[0];
+	} u;
 };

 struct skl_module_inst_id {
@@ -460,7 +469,7 @@ int skl_dsp_set_dma_control(struct skl_sst *ctx, u32 *caps,
 			u32 caps_size, u32 node_id);
 void skl_tplg_set_be_dmic_config(struct snd_soc_dai *dai,
 	struct skl_pipe_params *params, int stream);
-int skl_tplg_init(struct snd_soc_platform *platform,
+int skl_tplg_init(struct snd_soc_component *component,
 		struct hdac_ext_bus *ebus);
 struct skl_module_cfg *skl_tplg_fe_get_cpr_module(
 		struct snd_soc_dai *dai, int stream);
@@ -503,7 +512,8 @@ int skl_pcm_host_dma_prepare(struct device *dev,
 int skl_pcm_link_dma_prepare(struct device *dev,
 		struct skl_pipe_params *params);

-int skl_dai_load(struct snd_soc_component *, int index,
-		 struct snd_soc_dai_driver *dai_drv,
-		 struct snd_soc_tplg_pcm *pcm, struct snd_soc_dai *dai);
+int skl_dai_load(struct snd_soc_component *cmp,
+		 struct snd_soc_dai_driver *pcm_dai);
+void skl_tplg_add_moduleid_in_bind_params(struct skl *skl,
+				struct snd_soc_dapm_widget *w);
 #endif
diff --git a/sound/soc/soc-topology.c b/sound/soc/soc-topology.c
index b95a9ab0b526..de08693be9e1 100644
--- a/sound/soc/soc-topology.c
+++ b/sound/soc/soc-topology.c
@@ -315,7 +315,7 @@ static int soc_tplg_vendor_load_(struct soc_tplg *tplg,
 	int ret = 0;

 	if (tplg->comp && tplg->ops && tplg->ops->vendor_load)
-		ret = tplg->ops->vendor_load(tplg->comp, tplg->index, hdr);
+		ret = tplg->ops->vendor_load(tplg->comp, hdr);
 	else {
 		dev_err(tplg->dev, "ASoC: no vendor load callback for ID %d\n",
 			hdr->vendor_type);
@@ -347,8 +347,7 @@ static int soc_tplg_widget_load(struct soc_tplg *tplg,
 	struct snd_soc_dapm_widget *w, struct snd_soc_tplg_dapm_widget *tplg_w)
 {
 	if (tplg->comp && tplg->ops && tplg->ops->widget_load)
-		return tplg->ops->widget_load(tplg->comp, tplg->index, w,
-					      tplg_w);
+		return tplg->ops->widget_load(tplg->comp, w, tplg_w);

 	return 0;
 }
@@ -359,30 +358,27 @@ static int soc_tplg_widget_ready(struct soc_tplg *tplg,
 	struct snd_soc_dapm_widget *w, struct snd_soc_tplg_dapm_widget *tplg_w)
 {
 	if (tplg->comp && tplg->ops && tplg->ops->widget_ready)
-		return tplg->ops->widget_ready(tplg->comp, tplg->index, w,
-					       tplg_w);
+		return tplg->ops->widget_ready(tplg->comp, w, tplg_w);

 	return 0;
 }

 /* pass DAI configurations to component driver for extra initialization */
 static int soc_tplg_dai_load(struct soc_tplg *tplg,
-	struct snd_soc_dai_driver *dai_drv,
-	struct snd_soc_tplg_pcm *pcm, struct snd_soc_dai *dai)
+	struct snd_soc_dai_driver *dai_drv)
 {
 	if (tplg->comp && tplg->ops && tplg->ops->dai_load)
-		return tplg->ops->dai_load(tplg->comp, tplg->index, dai_drv,
-					   pcm, dai);
+		return tplg->ops->dai_load(tplg->comp, dai_drv);

 	return 0;
 }

 /* pass link configurations to component driver for extra initialization */
 static int soc_tplg_dai_link_load(struct soc_tplg *tplg,
-	struct snd_soc_dai_link *link, struct snd_soc_tplg_link_config *cfg)
+	struct snd_soc_dai_link *link)
 {
 	if (tplg->comp && tplg->ops && tplg->ops->link_load)
-		return tplg->ops->link_load(tplg->comp, tplg->index, link, cfg);
+		return tplg->ops->link_load(tplg->comp, link);

 	return 0;
 }
@@ -703,8 +699,7 @@ static int soc_tplg_init_kcontrol(struct soc_tplg *tplg,
 	struct snd_kcontrol_new *k, struct snd_soc_tplg_ctl_hdr *hdr)
 {
 	if (tplg->comp && tplg->ops && tplg->ops->control_load)
-		return tplg->ops->control_load(tplg->comp, tplg->index, k,
-					       hdr);
+		return tplg->ops->control_load(tplg->comp, k, hdr);

 	return 0;
 }
@@ -1161,17 +1156,6 @@ static int soc_tplg_kcontrol_elems_load(struct soc_tplg *tplg,
 	return 0;
 }

-/* optionally pass new dynamic kcontrol to component driver. */
*/ -static int soc_tplg_add_route(struct soc_tplg *tplg, - struct snd_soc_dapm_route *route) -{ - if (tplg->comp && tplg->ops && tplg->ops->dapm_route_load) - return tplg->ops->dapm_route_load(tplg->comp, tplg->index, - route); - - return 0; -} - static int soc_tplg_dapm_graph_elems_load(struct soc_tplg *tplg, struct snd_soc_tplg_hdr *hdr) { @@ -1220,8 +1204,6 @@ static int soc_tplg_dapm_graph_elems_load(struct soc_tplg *tplg, else route.control = elem->control; - soc_tplg_add_route(tplg, &route); - /* add route, but keep going if some fail */ snd_soc_dapm_add_routes(dapm, &route, 1); } @@ -1776,7 +1758,7 @@ static int soc_tplg_dai_create(struct soc_tplg *tplg, dai_drv->compress_new = snd_soc_new_compress; /* pass control to component driver for optional further init */ - ret = soc_tplg_dai_load(tplg, dai_drv, pcm, NULL); + ret = soc_tplg_dai_load(tplg, dai_drv); if (ret < 0) { dev_err(tplg->comp->dev, "ASoC: DAI loading failed\n"); kfree(dai_drv); @@ -1846,7 +1828,7 @@ static int soc_tplg_fe_link_create(struct soc_tplg *tplg, set_link_flags(link, pcm->flag_mask, pcm->flags); /* pass control to component driver for optional further init */ - ret = soc_tplg_dai_link_load(tplg, link, NULL); + ret = soc_tplg_dai_link_load(tplg, link); if (ret < 0) { dev_err(tplg->comp->dev, "ASoC: FE link loading failed\n"); kfree(link); @@ -2154,7 +2136,7 @@ static int soc_tplg_link_config(struct soc_tplg *tplg, set_link_flags(link, cfg->flag_mask, cfg->flags); /* pass control to component driver for optional further init */ - ret = soc_tplg_dai_link_load(tplg, link, cfg); + ret = soc_tplg_dai_link_load(tplg, link); if (ret < 0) { dev_err(tplg->dev, "ASoC: physical link loading failed\n"); return ret; @@ -2276,7 +2258,7 @@ static int soc_tplg_dai_config(struct soc_tplg *tplg, set_dai_flags(dai_drv, d->flag_mask, d->flags); /* pass control to component driver for optional further init */ - ret = soc_tplg_dai_load(tplg, dai_drv, NULL, dai); + ret = soc_tplg_dai_load(tplg, dai_drv); if (ret < 0) { dev_err(tplg->comp->dev, "ASoC: DAI loading failed\n"); return ret; @@ -2382,7 +2364,7 @@ static int soc_tplg_manifest_load(struct soc_tplg *tplg, /* pass control to component driver for optional further init */ if (tplg->comp && tplg->ops && tplg->ops->manifest) - return tplg->ops->manifest(tplg->comp, tplg->index, _manifest); + return tplg->ops->manifest(tplg->comp, _manifest); if (!abi_match) /* free the duplicated one */ kfree(_manifest); -- cgit v1.2.3 From 91e2dd0a47bae19600f13dcc9e0761082c50afa6 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Wed, 28 Mar 2018 13:19:25 -0500 Subject: ipmi: Add a panic handler for IPMI users Users of the IPMI code had their own panic handlers, but the ordering was not necessarily right: the base IPMI code would need to handle the panic first, and the user had no way to know whether the IPMI interface could run at panic time. Add a panic handler to the user interface; it is called if non-NULL and if the interface the user is on is capable of panic handling. It also cleans up the panic log handling a bit to reuse the existing interface loop in the main panic handler.
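To make the new hook concrete, here is a minimal sketch of a user registering a panic handler (not part of the patch; the function names and log message are invented for illustration):

	/* Illustrative only: a user that wants to act at panic time. */
	static void my_panic_handler(void *handler_data)
	{
		/*
		 * Called after the interface has been put into
		 * run-to-completion mode; keep this short and do not
		 * sleep or take locks that may be held at panic time.
		 */
		pr_emerg("example: IPMI interface usable at panic\n");
	}

	static void my_recv_handler(struct ipmi_recv_msg *msg,
				    void *user_msg_data)
	{
		ipmi_free_recv_msg(msg);
	}

	static const struct ipmi_user_hndl my_user_hndl = {
		.ipmi_recv_hndl     = my_recv_handler,
		.ipmi_panic_handler = my_panic_handler, /* NULL = not interested */
	};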
Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_msghandler.c | 214 ++++++++++++++++++------------------ include/linux/ipmi.h | 6 + 2 files changed, 110 insertions(+), 110 deletions(-) (limited to 'include') diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index dcfbf2e3c8c5..9ffbb5f9c7bd 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -4956,13 +4956,15 @@ static void device_id_fetcher(ipmi_smi_t intf, struct ipmi_recv_msg *msg) } } -static void send_panic_events(char *str) +static void send_panic_events(ipmi_smi_t intf, char *str) { - struct kernel_ipmi_msg msg; - ipmi_smi_t intf; - unsigned char data[16]; + struct kernel_ipmi_msg msg; + unsigned char data[16]; struct ipmi_system_interface_addr *si; - struct ipmi_addr addr; + struct ipmi_addr addr; + char *p = str; + struct ipmi_ipmb_addr *ipmb; + int j; if (ipmi_send_panic_event == IPMI_SEND_PANIC_EVENT_NONE) return; @@ -4993,15 +4995,8 @@ static void send_panic_events(char *str) data[7] = str[2]; } - /* For every registered interface, send the event. */ - list_for_each_entry_rcu(intf, &ipmi_interfaces, link) { - if (!intf->handlers || !intf->handlers->poll) - /* Interface is not ready or can't run at panic time. */ - continue; - - /* Send the event announcing the panic. */ - ipmi_panic_request_and_wait(intf, &addr, &msg); - } + /* Send the event announcing the panic. */ + ipmi_panic_request_and_wait(intf, &addr, &msg); /* * On every interface, dump a bunch of OEM event holding the @@ -5010,111 +5005,100 @@ static void send_panic_events(char *str) if (ipmi_send_panic_event != IPMI_SEND_PANIC_EVENT_STRING || !str) return; - /* For every registered interface, send the event. */ - list_for_each_entry_rcu(intf, &ipmi_interfaces, link) { - char *p = str; - struct ipmi_ipmb_addr *ipmb; - int j; - - if (intf->intf_num == -1) - /* Interface was not ready yet. */ - continue; + /* + * intf_num is used as an marker to tell if the + * interface is valid. Thus we need a read barrier to + * make sure data fetched before checking intf_num + * won't be used. + */ + smp_rmb(); - /* - * intf_num is used as an marker to tell if the - * interface is valid. Thus we need a read barrier to - * make sure data fetched before checking intf_num - * won't be used. - */ - smp_rmb(); + /* + * First job here is to figure out where to send the + * OEM events. There's no way in IPMI to send OEM + * events using an event send command, so we have to + * find the SEL to put them in and stick them in + * there. + */ - /* - * First job here is to figure out where to send the - * OEM events. There's no way in IPMI to send OEM - * events using an event send command, so we have to - * find the SEL to put them in and stick them in - * there. - */ + /* Get capabilities from the get device id. */ + intf->local_sel_device = 0; + intf->local_event_generator = 0; + intf->event_receiver = 0; - /* Get capabilities from the get device id. */ - intf->local_sel_device = 0; - intf->local_event_generator = 0; - intf->event_receiver = 0; + /* Request the device info from the local MC. */ + msg.netfn = IPMI_NETFN_APP_REQUEST; + msg.cmd = IPMI_GET_DEVICE_ID_CMD; + msg.data = NULL; + msg.data_len = 0; + intf->null_user_handler = device_id_fetcher; + ipmi_panic_request_and_wait(intf, &addr, &msg); - /* Request the device info from the local MC. */ - msg.netfn = IPMI_NETFN_APP_REQUEST; - msg.cmd = IPMI_GET_DEVICE_ID_CMD; + if (intf->local_event_generator) { + /* Request the event receiver from the local MC. 
*/ + msg.netfn = IPMI_NETFN_SENSOR_EVENT_REQUEST; + msg.cmd = IPMI_GET_EVENT_RECEIVER_CMD; msg.data = NULL; msg.data_len = 0; - intf->null_user_handler = device_id_fetcher; + intf->null_user_handler = event_receiver_fetcher; ipmi_panic_request_and_wait(intf, &addr, &msg); + } + intf->null_user_handler = NULL; - if (intf->local_event_generator) { - /* Request the event receiver from the local MC. */ - msg.netfn = IPMI_NETFN_SENSOR_EVENT_REQUEST; - msg.cmd = IPMI_GET_EVENT_RECEIVER_CMD; - msg.data = NULL; - msg.data_len = 0; - intf->null_user_handler = event_receiver_fetcher; - ipmi_panic_request_and_wait(intf, &addr, &msg); - } - intf->null_user_handler = NULL; + /* + * Validate the event receiver. The low bit must not + * be 1 (it must be a valid IPMB address), it cannot + * be zero, and it must not be my address. + */ + if (((intf->event_receiver & 1) == 0) + && (intf->event_receiver != 0) + && (intf->event_receiver != intf->addrinfo[0].address)) { + /* + * The event receiver is valid, send an IPMB + * message. + */ + ipmb = (struct ipmi_ipmb_addr *) &addr; + ipmb->addr_type = IPMI_IPMB_ADDR_TYPE; + ipmb->channel = 0; /* FIXME - is this right? */ + ipmb->lun = intf->event_receiver_lun; + ipmb->slave_addr = intf->event_receiver; + } else if (intf->local_sel_device) { + /* + * The event receiver was not valid (or was + * me), but I am an SEL device, just dump it + * in my SEL. + */ + si = (struct ipmi_system_interface_addr *) &addr; + si->addr_type = IPMI_SYSTEM_INTERFACE_ADDR_TYPE; + si->channel = IPMI_BMC_CHANNEL; + si->lun = 0; + } else + return; /* No where to send the event. */ + msg.netfn = IPMI_NETFN_STORAGE_REQUEST; /* Storage. */ + msg.cmd = IPMI_ADD_SEL_ENTRY_CMD; + msg.data = data; + msg.data_len = 16; + + j = 0; + while (*p) { + int size = strlen(p); + + if (size > 11) + size = 11; + data[0] = 0; + data[1] = 0; + data[2] = 0xf0; /* OEM event without timestamp. */ + data[3] = intf->addrinfo[0].address; + data[4] = j++; /* sequence # */ /* - * Validate the event receiver. The low bit must not - * be 1 (it must be a valid IPMB address), it cannot - * be zero, and it must not be my address. + * Always give 11 bytes, so strncpy will fill + * it with zeroes for me. */ - if (((intf->event_receiver & 1) == 0) - && (intf->event_receiver != 0) - && (intf->event_receiver != intf->addrinfo[0].address)) { - /* - * The event receiver is valid, send an IPMB - * message. - */ - ipmb = (struct ipmi_ipmb_addr *) &addr; - ipmb->addr_type = IPMI_IPMB_ADDR_TYPE; - ipmb->channel = 0; /* FIXME - is this right? */ - ipmb->lun = intf->event_receiver_lun; - ipmb->slave_addr = intf->event_receiver; - } else if (intf->local_sel_device) { - /* - * The event receiver was not valid (or was - * me), but I am an SEL device, just dump it - * in my SEL. - */ - si = (struct ipmi_system_interface_addr *) &addr; - si->addr_type = IPMI_SYSTEM_INTERFACE_ADDR_TYPE; - si->channel = IPMI_BMC_CHANNEL; - si->lun = 0; - } else - continue; /* No where to send the event. */ - - msg.netfn = IPMI_NETFN_STORAGE_REQUEST; /* Storage. */ - msg.cmd = IPMI_ADD_SEL_ENTRY_CMD; - msg.data = data; - msg.data_len = 16; - - j = 0; - while (*p) { - int size = strlen(p); - - if (size > 11) - size = 11; - data[0] = 0; - data[1] = 0; - data[2] = 0xf0; /* OEM event without timestamp. */ - data[3] = intf->addrinfo[0].address; - data[4] = j++; /* sequence # */ - /* - * Always give 11 bytes, so strncpy will fill - * it with zeroes for me. 
- */ - strncpy(data+5, p, 11); - p += size; + strncpy(data+5, p, 11); + p += size; - ipmi_panic_request_and_wait(intf, &addr, &msg); - } + ipmi_panic_request_and_wait(intf, &addr, &msg); } } @@ -5125,6 +5109,7 @@ static int panic_event(struct notifier_block *this, void *ptr) { ipmi_smi_t intf; + ipmi_user_t user; if (has_panicked) return NOTIFY_DONE; @@ -5132,10 +5117,13 @@ static int panic_event(struct notifier_block *this, /* For every registered interface, set it to run to completion. */ list_for_each_entry_rcu(intf, &ipmi_interfaces, link) { - if (!intf->handlers) + if (!intf->handlers || intf->intf_num == -1) /* Interface is not ready. */ continue; + if (!intf->handlers->poll) + continue; + /* * If we were interrupted while locking xmit_msgs_lock or * waiting_rcv_msgs_lock, the corresponding list may be @@ -5157,9 +5145,15 @@ static int panic_event(struct notifier_block *this, if (intf->handlers->set_run_to_completion) intf->handlers->set_run_to_completion(intf->send_info, 1); - } - send_panic_events(ptr); + list_for_each_entry_rcu(user, &intf->users, link) { + if (user->handler->ipmi_panic_handler) + user->handler->ipmi_panic_handler( + user->handler_data); + } + + send_panic_events(intf, ptr); + } return NOTIFY_DONE; } diff --git a/include/linux/ipmi.h b/include/linux/ipmi.h index 8b0626cec980..39a29fb3131b 100644 --- a/include/linux/ipmi.h +++ b/include/linux/ipmi.h @@ -77,6 +77,12 @@ struct ipmi_user_hndl { /* Called when the interface detects a watchdog pre-timeout. If this is NULL, it will be ignored for the user. */ void (*ipmi_watchdog_pretimeout)(void *handler_data); + + /* + * If not NULL, called at panic time after the interface has + * been set up to handle run to completion. + */ + void (*ipmi_panic_handler)(void *handler_data); }; /* Create a new user of the IPMI layer on the given interface number. */ -- cgit v1.2.3 From 6dc1181f9fbcf7ba0e62adfaea41666f00ee9d18 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Wed, 4 Apr 2018 08:54:05 -0500 Subject: ipmi: Clean up comments in include files. Make the comments correct and consistent. Signed-off-by: Corey Minyard --- include/linux/ipmi.h | 105 ++++++++++++++++++++++++++----------------- include/linux/ipmi_smi.h | 115 ++++++++++++++++++++++++++++------------------- 2 files changed, 134 insertions(+), 86 deletions(-) (limited to 'include') diff --git a/include/linux/ipmi.h b/include/linux/ipmi.h index 39a29fb3131b..3474f04cf9aa 100644 --- a/include/linux/ipmi.h +++ b/include/linux/ipmi.h @@ -23,8 +23,10 @@ struct module; struct device; -/* Opaque type for a IPMI message user. One of these is needed to - send and receive messages. */ +/* + * Opaque type for a IPMI message user. One of these is needed to + * send and receive messages. + */ typedef struct ipmi_user *ipmi_user_t; /* @@ -37,8 +39,10 @@ typedef struct ipmi_user *ipmi_user_t; struct ipmi_recv_msg { struct list_head link; - /* The type of message as defined in the "Receive Types" - defines above. */ + /* + * The type of message as defined in the "Receive Types" + * defines above. + */ int recv_type; ipmi_user_t user; @@ -46,19 +50,25 @@ struct ipmi_recv_msg { long msgid; struct kernel_ipmi_msg msg; - /* The user_msg_data is the data supplied when a message was - sent, if this is a response to a sent message. If this is - not a response to a sent message, then user_msg_data will - be NULL. If the user above is NULL, then this will be the - intf. */ + /* + * The user_msg_data is the data supplied when a message was + * sent, if this is a response to a sent message. 
If this is + * not a response to a sent message, then user_msg_data will + * be NULL. If the user above is NULL, then this will be the + * intf. + */ void *user_msg_data; - /* Call this when done with the message. It will presumably free - the message and do any other necessary cleanup. */ + /* + * Call this when done with the message. It will presumably free + * the message and do any other necessary cleanup. + */ void (*done)(struct ipmi_recv_msg *msg); - /* Place-holder for the data, don't make any assumptions about - the size or existence of this, since it may change. */ + /* + * Place-holder for the data, don't make any assumptions about + * the size or existence of this, since it may change. + */ unsigned char msg_data[IPMI_MAX_MSG_LENGTH]; }; @@ -66,16 +76,20 @@ struct ipmi_recv_msg { void ipmi_free_recv_msg(struct ipmi_recv_msg *msg); struct ipmi_user_hndl { - /* Routine type to call when a message needs to be routed to - the upper layer. This will be called with some locks held, - the only IPMI routines that can be called are ipmi_request - and the alloc/free operations. The handler_data is the - variable supplied when the receive handler was registered. */ + /* + * Routine type to call when a message needs to be routed to + * the upper layer. This will be called with some locks held, + * the only IPMI routines that can be called are ipmi_request + * and the alloc/free operations. The handler_data is the + * variable supplied when the receive handler was registered. + */ void (*ipmi_recv_hndl)(struct ipmi_recv_msg *msg, void *user_msg_data); - /* Called when the interface detects a watchdog pre-timeout. If - this is NULL, it will be ignored for the user. */ + /* + * Called when the interface detects a watchdog pre-timeout. If + * this is NULL, it will be ignored for the user. + */ void (*ipmi_watchdog_pretimeout)(void *handler_data); /* @@ -91,12 +105,14 @@ int ipmi_create_user(unsigned int if_num, void *handler_data, ipmi_user_t *user); -/* Destroy the given user of the IPMI layer. Note that after this - function returns, the system is guaranteed to not call any - callbacks for the user. Thus as long as you destroy all the users - before you unload a module, you will be safe. And if you destroy - the users before you destroy the callback structures, it should be - safe, too. */ +/* + * Destroy the given user of the IPMI layer. Note that after this + * function returns, the system is guaranteed to not call any + * callbacks for the user. Thus as long as you destroy all the users + * before you unload a module, you will be safe. And if you destroy + * the users before you destroy the callback structures, it should be + * safe, too. + */ int ipmi_destroy_user(ipmi_user_t user); /* Get the IPMI version of the BMC we are talking to. */ @@ -104,12 +120,15 @@ int ipmi_get_version(ipmi_user_t user, unsigned char *major, unsigned char *minor); -/* Set and get the slave address and LUN that we will use for our - source messages. Note that this affects the interface, not just - this user, so it will affect all users of this interface. This is - so some initialization code can come in and do the OEM-specific - things it takes to determine your address (if not the BMC) and set - it for everyone else. Note that each channel can have its own address. */ +/* + * Set and get the slave address and LUN that we will use for our + * source messages. Note that this affects the interface, not just + * this user, so it will affect all users of this interface. 
This is + * so some initialization code can come in and do the OEM-specific + * things it takes to determine your address (if not the BMC) and set + * it for everyone else. Note that each channel can have its own + * address. + */ int ipmi_set_my_address(ipmi_user_t user, unsigned int channel, unsigned char address); @@ -235,14 +254,18 @@ int ipmi_set_gets_events(ipmi_user_t user, bool val); struct ipmi_smi_watcher { struct list_head link; - /* You must set the owner to the current module, if you are in - a module (generally just set it to "THIS_MODULE"). */ + /* + * You must set the owner to the current module, if you are in + * a module (generally just set it to "THIS_MODULE"). + */ struct module *owner; - /* These two are called with read locks held for the interface - the watcher list. So you can add and remove users from the - IPMI interface, send messages, etc., but you cannot add - or remove SMI watchers or SMI interfaces. */ + /* + * These two are called with read locks held for the interface + * the watcher list. So you can add and remove users from the + * IPMI interface, send messages, etc., but you cannot add + * or remove SMI watchers or SMI interfaces. + */ void (*new_smi)(int if_num, struct device *dev); void (*smi_gone)(int if_num); }; @@ -250,8 +273,10 @@ struct ipmi_smi_watcher { int ipmi_smi_watcher_register(struct ipmi_smi_watcher *watcher); int ipmi_smi_watcher_unregister(struct ipmi_smi_watcher *watcher); -/* The following are various helper functions for dealing with IPMI - addresses. */ +/* + * The following are various helper functions for dealing with IPMI + * addresses. + */ /* Return the maximum length of an IPMI address given it's type. */ unsigned int ipmi_addr_length(int addr_type); diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h index af457b5a689e..9e5c3079d232 100644 --- a/include/linux/ipmi_smi.h +++ b/include/linux/ipmi_smi.h @@ -22,8 +22,10 @@ struct device; -/* This files describes the interface for IPMI system management interface - drivers to bind into the IPMI message handler. */ +/* + * This files describes the interface for IPMI system management interface + * drivers to bind into the IPMI message handler. + */ /* Structure for the low-level drivers. */ typedef struct ipmi_smi *ipmi_smi_t; @@ -61,10 +63,12 @@ struct ipmi_smi_msg { struct ipmi_smi_handlers { struct module *owner; - /* The low-level interface cannot start sending messages to - the upper layer until this function is called. This may - not be NULL, the lower layer must take the interface from - this call. */ + /* + * The low-level interface cannot start sending messages to + * the upper layer until this function is called. This may + * not be NULL, the lower layer must take the interface from + * this call. + */ int (*start_processing)(void *send_info, ipmi_smi_t new_intf); @@ -75,25 +79,31 @@ struct ipmi_smi_handlers { */ int (*get_smi_info)(void *send_info, struct ipmi_smi_info *data); - /* Called to enqueue an SMI message to be sent. This - operation is not allowed to fail. If an error occurs, it - should report back the error in a received message. It may - do this in the current call context, since no write locks - are held when this is run. Message are delivered one at - a time by the message handler, a new message will not be - delivered until the previous message is returned. */ + /* + * Called to enqueue an SMI message to be sent. This + * operation is not allowed to fail. If an error occurs, it + * should report back the error in a received message. 
It may + * do this in the current call context, since no write locks + * are held when this is run. Message are delivered one at + * a time by the message handler, a new message will not be + * delivered until the previous message is returned. + */ void (*sender)(void *send_info, struct ipmi_smi_msg *msg); - /* Called by the upper layer to request that we try to get - events from the BMC we are attached to. */ + /* + * Called by the upper layer to request that we try to get + * events from the BMC we are attached to. + */ void (*request_events)(void *send_info); - /* Called by the upper layer when some user requires that the - interface watch for events, received messages, watchdog - pretimeouts, or not. Used by the SMI to know if it should - watch for these. This may be NULL if the SMI does not - implement it. */ + /* + * Called by the upper layer when some user requires that the + * interface watch for events, received messages, watchdog + * pretimeouts, or not. Used by the SMI to know if it should + * watch for these. This may be NULL if the SMI does not + * implement it. + */ void (*set_need_watch)(void *send_info, bool enable); /* @@ -101,28 +111,36 @@ struct ipmi_smi_handlers { */ void (*flush_messages)(void *send_info); - /* Called when the interface should go into "run to - completion" mode. If this call sets the value to true, the - interface should make sure that all messages are flushed - out and that none are pending, and any new requests are run - to completion immediately. */ + /* + * Called when the interface should go into "run to + * completion" mode. If this call sets the value to true, the + * interface should make sure that all messages are flushed + * out and that none are pending, and any new requests are run + * to completion immediately. + */ void (*set_run_to_completion)(void *send_info, bool run_to_completion); - /* Called to poll for work to do. This is so upper layers can - poll for operations during things like crash dumps. */ + /* + * Called to poll for work to do. This is so upper layers can + * poll for operations during things like crash dumps. + */ void (*poll)(void *send_info); - /* Enable/disable firmware maintenance mode. Note that this - is *not* the modes defined, this is simply an on/off - setting. The message handler does the mode handling. Note - that this is called from interrupt context, so it cannot - block. */ + /* + * Enable/disable firmware maintenance mode. Note that this + * is *not* the modes defined, this is simply an on/off + * setting. The message handler does the mode handling. Note + * that this is called from interrupt context, so it cannot + * block. + */ void (*set_maintenance_mode)(void *send_info, bool enable); - /* Tell the handler that we are using it/not using it. The - message handler get the modules that this handler belongs - to; this function lets the SMI claim any modules that it - uses. These may be NULL if this is not required. */ + /* + * Tell the handler that we are using it/not using it. The + * message handler get the modules that this handler belongs + * to; this function lets the SMI claim any modules that it + * uses. These may be NULL if this is not required. 
+ */ int (*inc_usecount)(void *send_info); void (*dec_usecount)(void *send_info); }; @@ -143,7 +161,8 @@ struct ipmi_device_id { #define ipmi_version_major(v) ((v)->ipmi_version & 0xf) #define ipmi_version_minor(v) ((v)->ipmi_version >> 4) -/* Take a pointer to an IPMI response and extract device id information from +/* + * Take a pointer to an IPMI response and extract device id information from * it. @netfn is in the IPMI_NETFN_ format, so may need to be shifted from * a SI response. */ @@ -187,12 +206,14 @@ static inline int ipmi_demangle_device_id(uint8_t netfn, uint8_t cmd, return 0; } -/* Add a low-level interface to the IPMI driver. Note that if the - interface doesn't know its slave address, it should pass in zero. - The low-level interface should not deliver any messages to the - upper layer until the start_processing() function in the handlers - is called, and the lower layer must get the interface from that - call. */ +/* + * Add a low-level interface to the IPMI driver. Note that if the + * interface doesn't know its slave address, it should pass in zero. + * The low-level interface should not deliver any messages to the + * upper layer until the start_processing() function in the handlers + * is called, and the lower layer must get the interface from that + * call. + */ int ipmi_register_smi(const struct ipmi_smi_handlers *handlers, void *send_info, struct device *dev, @@ -223,9 +244,11 @@ static inline void ipmi_free_smi_msg(struct ipmi_smi_msg *msg) } #ifdef CONFIG_IPMI_PROC_INTERFACE -/* Allow the lower layer to add things to the proc filesystem - directory for this interface. Note that the entry will - automatically be dstroyed when the interface is destroyed. */ +/* + * Allow the lower layer to add things to the proc filesystem + * directory for this interface. Note that the entry will + * automatically be dstroyed when the interface is destroyed. + */ int ipmi_smi_add_proc_entry(ipmi_smi_t smi, char *name, const struct file_operations *proc_ops, void *data); -- cgit v1.2.3 From b7780dab90e8da302bc7d0d1b39b538b822017e8 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Thu, 5 Apr 2018 16:44:12 -0500 Subject: ipmi: Add shutdown functions for users and interfaces Since things that IPMI uses can be hot-swapped, the users and interfaces really need to be able to handle this. Add the functions so the users and interfaces can implement them, the actual function will be added after everything is ready. Signed-off-by: Corey Minyard --- include/linux/ipmi.h | 8 ++++++++ include/linux/ipmi_smi.h | 6 ++++++ 2 files changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/ipmi.h b/include/linux/ipmi.h index 3474f04cf9aa..d89bea1e457a 100644 --- a/include/linux/ipmi.h +++ b/include/linux/ipmi.h @@ -97,6 +97,14 @@ struct ipmi_user_hndl { * been set up to handle run to completion. */ void (*ipmi_panic_handler)(void *handler_data); + + /* + * Called when the interface has been removed. After this returns + * the user handle will be invalid. The interface may or may + * not be usable when this is called, but it will return errors + * if it is not usable. + */ + void (*shutdown)(void *handler_data); }; /* Create a new user of the IPMI layer on the given interface number. 
*/ diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h index 9e5c3079d232..bdcda4741c89 100644 --- a/include/linux/ipmi_smi.h +++ b/include/linux/ipmi_smi.h @@ -72,6 +72,12 @@ struct ipmi_smi_handlers { int (*start_processing)(void *send_info, ipmi_smi_t new_intf); + /* + * When called, the low-level interface should disable all + * processing, it should be complete shut down when it returns. + */ + void (*shutdown)(void *send_info); + /* * Get the detailed private info of the low level interface and store * it into the structure of ipmi_smi_data. For example: the -- cgit v1.2.3 From 8eb005bf6e58ca08802ac8170e5d3c8486cb2d79 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Thu, 5 Apr 2018 22:10:08 -0500 Subject: ipmi: Remove usecount function from interfaces All the users are now gone. Signed-off-by: Corey Minyard --- include/linux/ipmi_smi.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include') diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h index bdcda4741c89..16662b0423bf 100644 --- a/include/linux/ipmi_smi.h +++ b/include/linux/ipmi_smi.h @@ -140,15 +140,6 @@ struct ipmi_smi_handlers { * block. */ void (*set_maintenance_mode)(void *send_info, bool enable); - - /* - * Tell the handler that we are using it/not using it. The - * message handler get the modules that this handler belongs - * to; this function lets the SMI claim any modules that it - * uses. These may be NULL if this is not required. - */ - int (*inc_usecount)(void *send_info); - void (*dec_usecount)(void *send_info); }; struct ipmi_device_id { -- cgit v1.2.3 From 6a0d23ed338ed7015128378e0ceec03eaa3d91e2 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Wed, 11 Apr 2018 12:41:33 -0500 Subject: ipmi: ipmi_unregister_smi() cannot fail, have it return void Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_msghandler.c | 4 +--- drivers/char/ipmi/ipmi_si_intf.c | 5 +---- drivers/char/ipmi/ipmi_ssif.c | 4 +--- include/linux/ipmi_smi.h | 2 +- 4 files changed, 4 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index 7ddadab65f33..946bfcb1eeee 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -3711,7 +3711,7 @@ static void cleanup_smi_msgs(struct ipmi_smi *intf) } } -int ipmi_unregister_smi(struct ipmi_smi *intf) +void ipmi_unregister_smi(struct ipmi_smi *intf) { struct ipmi_smi_watcher *w; int intf_num = intf->intf_num, index; @@ -3755,8 +3755,6 @@ int ipmi_unregister_smi(struct ipmi_smi *intf) cleanup_srcu_struct(&intf->users_srcu); kref_put(&intf->refcount, intf_free); - - return 0; } EXPORT_SYMBOL(ipmi_unregister_smi); diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index 00a324060dcd..2222caf4bab7 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -2365,16 +2365,13 @@ static void shutdown_smi(void *send_info) static void shutdown_one_si(struct smi_info *smi_info) { - int rv; ipmi_smi_t intf = smi_info->intf; if (!intf) return; smi_info->intf = NULL; - rv = ipmi_unregister_smi(intf); - if (rv) - pr_err(PFX "Unable to unregister device: errno=%d\n", rv); + ipmi_unregister_smi(intf); } static void cleanup_one_si(struct smi_info *smi_info) diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c index 8c72f271d4b4..17cae7a41b70 100644 --- a/drivers/char/ipmi/ipmi_ssif.c +++ b/drivers/char/ipmi/ipmi_ssif.c @@ -1247,9 +1247,7 @@ static int ssif_remove(struct 
i2c_client *client) */ intf = ssif_info->intf; ssif_info->intf = NULL; - rv = ipmi_unregister_smi(intf); - if (rv) - pr_err(PFX "Unable to unregister device: errno=%d\n", rv); + ipmi_unregister_smi(intf); list_for_each_entry(addr_info, &ssif_infos, link) { if (addr_info->client == client) { diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h index 16662b0423bf..26ba57c307f0 100644 --- a/include/linux/ipmi_smi.h +++ b/include/linux/ipmi_smi.h @@ -220,7 +220,7 @@ int ipmi_register_smi(const struct ipmi_smi_handlers *handlers, * Remove a low-level interface from the IPMI driver. This will * return an error if the interface is still in use by a user. */ -int ipmi_unregister_smi(ipmi_smi_t intf); +void ipmi_unregister_smi(ipmi_smi_t intf); /* * The lower layer reports received messages through this interface. -- cgit v1.2.3 From 5ce1a7dc806efb2181e93d4a088b281c4cff4eaa Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Wed, 11 Apr 2018 13:11:54 -0500 Subject: ipmi: Get rid of ipmi_user_t and ipmi_smi_t in include files Convert over to struct ipmi_user * and struct ipmi_smi *. Signed-off-by: Corey Minyard --- include/linux/ipmi.h | 34 +++++++++++++++++----------------- include/linux/ipmi_smi.h | 12 ++++++------ 2 files changed, 23 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/ipmi.h b/include/linux/ipmi.h index d89bea1e457a..41f5c086f670 100644 --- a/include/linux/ipmi.h +++ b/include/linux/ipmi.h @@ -45,7 +45,7 @@ struct ipmi_recv_msg { */ int recv_type; - ipmi_user_t user; + struct ipmi_user *user; struct ipmi_addr addr; long msgid; struct kernel_ipmi_msg msg; @@ -111,7 +111,7 @@ struct ipmi_user_hndl { int ipmi_create_user(unsigned int if_num, const struct ipmi_user_hndl *handler, void *handler_data, - ipmi_user_t *user); + struct ipmi_user **user); /* * Destroy the given user of the IPMI layer. Note that after this @@ -121,10 +121,10 @@ int ipmi_create_user(unsigned int if_num, * the users before you destroy the callback structures, it should be * safe, too. */ -int ipmi_destroy_user(ipmi_user_t user); +int ipmi_destroy_user(struct ipmi_user *user); /* Get the IPMI version of the BMC we are talking to. */ -int ipmi_get_version(ipmi_user_t user, +int ipmi_get_version(struct ipmi_user *user, unsigned char *major, unsigned char *minor); @@ -137,16 +137,16 @@ int ipmi_get_version(ipmi_user_t user, * it for everyone else. Note that each channel can have its own * address. */ -int ipmi_set_my_address(ipmi_user_t user, +int ipmi_set_my_address(struct ipmi_user *user, unsigned int channel, unsigned char address); -int ipmi_get_my_address(ipmi_user_t user, +int ipmi_get_my_address(struct ipmi_user *user, unsigned int channel, unsigned char *address); -int ipmi_set_my_LUN(ipmi_user_t user, +int ipmi_set_my_LUN(struct ipmi_user *user, unsigned int channel, unsigned char LUN); -int ipmi_get_my_LUN(ipmi_user_t user, +int ipmi_get_my_LUN(struct ipmi_user *user, unsigned int channel, unsigned char *LUN); @@ -163,7 +163,7 @@ int ipmi_get_my_LUN(ipmi_user_t user, * it makes no sense to do it here. However, this can be used if you * have unusual requirements. */ -int ipmi_request_settime(ipmi_user_t user, +int ipmi_request_settime(struct ipmi_user *user, struct ipmi_addr *addr, long msgid, struct kernel_ipmi_msg *msg, @@ -181,7 +181,7 @@ int ipmi_request_settime(ipmi_user_t user, * change as the system changes, so don't use it unless you REALLY * have to. 
*/ -int ipmi_request_supply_msgs(ipmi_user_t user, +int ipmi_request_supply_msgs(struct ipmi_user *user, struct ipmi_addr *addr, long msgid, struct kernel_ipmi_msg *msg, @@ -197,7 +197,7 @@ int ipmi_request_supply_msgs(ipmi_user_t user, * way. This is useful if you need to spin waiting for something to * happen in the IPMI driver. */ -void ipmi_poll_interface(ipmi_user_t user); +void ipmi_poll_interface(struct ipmi_user *user); /* * When commands come in to the SMS, the user can register to receive @@ -208,11 +208,11 @@ void ipmi_poll_interface(ipmi_user_t user); * error. Channels are specified as a bitfield, use IPMI_CHAN_ALL to * mean all channels. */ -int ipmi_register_for_cmd(ipmi_user_t user, +int ipmi_register_for_cmd(struct ipmi_user *user, unsigned char netfn, unsigned char cmd, unsigned int chans); -int ipmi_unregister_for_cmd(ipmi_user_t user, +int ipmi_unregister_for_cmd(struct ipmi_user *user, unsigned char netfn, unsigned char cmd, unsigned int chans); @@ -243,8 +243,8 @@ int ipmi_unregister_for_cmd(ipmi_user_t user, * * See the IPMI_MAINTENANCE_MODE_xxx defines for what the mode means. */ -int ipmi_get_maintenance_mode(ipmi_user_t user); -int ipmi_set_maintenance_mode(ipmi_user_t user, int mode); +int ipmi_get_maintenance_mode(struct ipmi_user *user); +int ipmi_set_maintenance_mode(struct ipmi_user *user, int mode); /* * When the user is created, it will not receive IPMI events by @@ -252,7 +252,7 @@ int ipmi_set_maintenance_mode(ipmi_user_t user, int mode); * The first user that sets this to TRUE will receive all events that * have been queued while no one was waiting for events. */ -int ipmi_set_gets_events(ipmi_user_t user, bool val); +int ipmi_set_gets_events(struct ipmi_user *user, bool val); /* * Called when a new SMI is registered. This will also be called on @@ -330,7 +330,7 @@ struct ipmi_smi_info { union ipmi_smi_info_union addr_info; }; -/* This is to get the private info of ipmi_smi_t */ +/* This is to get the private info of struct ipmi_smi */ extern int ipmi_get_smi_info(int if_num, struct ipmi_smi_info *data); #endif /* __LINUX_IPMI_H */ diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h index 26ba57c307f0..0d438662a821 100644 --- a/include/linux/ipmi_smi.h +++ b/include/linux/ipmi_smi.h @@ -69,8 +69,8 @@ struct ipmi_smi_handlers { * not be NULL, the lower layer must take the interface from * this call. */ - int (*start_processing)(void *send_info, - ipmi_smi_t new_intf); + int (*start_processing)(void *send_info, + struct ipmi_smi *new_intf); /* * When called, the low-level interface should disable all @@ -220,7 +220,7 @@ int ipmi_register_smi(const struct ipmi_smi_handlers *handlers, * Remove a low-level interface from the IPMI driver. This will * return an error if the interface is still in use by a user. */ -void ipmi_unregister_smi(ipmi_smi_t intf); +void ipmi_unregister_smi(struct ipmi_smi *intf); /* * The lower layer reports received messages through this interface. @@ -228,11 +228,11 @@ void ipmi_unregister_smi(ipmi_smi_t intf); * the lower layer gets an error sending a message, it should format * an error response in the message response. */ -void ipmi_smi_msg_received(ipmi_smi_t intf, +void ipmi_smi_msg_received(struct ipmi_smi *intf, struct ipmi_smi_msg *msg); /* The lower layer received a watchdog pre-timeout on interface. 
*/ -void ipmi_smi_watchdog_pretimeout(ipmi_smi_t intf); +void ipmi_smi_watchdog_pretimeout(struct ipmi_smi *intf); struct ipmi_smi_msg *ipmi_alloc_smi_msg(void); static inline void ipmi_free_smi_msg(struct ipmi_smi_msg *msg) @@ -246,7 +246,7 @@ static inline void ipmi_free_smi_msg(struct ipmi_smi_msg *msg) * directory for this interface. Note that the entry will * automatically be dstroyed when the interface is destroyed. */ -int ipmi_smi_add_proc_entry(ipmi_smi_t smi, char *name, +int ipmi_smi_add_proc_entry(struct ipmi_smi *smi, char *name, const struct file_operations *proc_ops, void *data); #endif -- cgit v1.2.3 From b32cc5b9a346319c171e3ad905e0cddda032b5eb Mon Sep 17 00:00:00 2001 From: "Nikita V. Shirokov" Date: Tue, 17 Apr 2018 21:42:13 -0700 Subject: bpf: adding bpf_xdp_adjust_tail helper Add a new BPF helper that allows manipulating xdp's data_end pointer so that a packet's size can be reduced. The intended use case is generating ICMP messages from XDP context, where such a message would contain the truncated original packet. Signed-off-by: Nikita V. Shirokov Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 10 +++++++++- net/core/filter.c | 29 ++++++++++++++++++++++++++++- 2 files changed, 37 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c5ec89732a8d..9a2d1a04eb24 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -755,6 +755,13 @@ union bpf_attr { * @addr: pointer to struct sockaddr to bind socket to * @addr_len: length of sockaddr structure * Return: 0 on success or negative error code + * + * int bpf_xdp_adjust_tail(xdp_md, delta) + * Adjust the xdp_md.data_end by delta. Only shrinking of packet's + * size is supported. + * @xdp_md: pointer to xdp_md + * @delta: A negative integer to be added to xdp_md.data_end + * Return: 0 on success or negative on error */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -821,7 +828,8 @@ union bpf_attr { FN(msg_apply_bytes), \ FN(msg_cork_bytes), \ FN(msg_pull_data), \ - FN(bind), + FN(bind), \ + FN(xdp_adjust_tail), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/core/filter.c b/net/core/filter.c index a374b8560bc4..29318598fd60 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2725,6 +2725,30 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset) +{ + void *data_end = xdp->data_end + offset; + + /* only shrinking is allowed for now. */
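+ /* (Note added in editing, not in the original patch: offset must be
+  * strictly negative here, so data_end only ever moves toward data
+  * and the packet shrinks; growing the tail would need tailroom
+  * handling that this helper does not attempt. The ETH_HLEN check
+  * below keeps at least a full Ethernet header in the packet.) */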
+ if (unlikely(offset >= 0)) + return -EINVAL; + + if (unlikely(data_end < xdp->data + ETH_HLEN)) + return -EINVAL; + + xdp->data_end = data_end; + + return 0; +} + +static const struct bpf_func_proto bpf_xdp_adjust_tail_proto = { + .func = bpf_xdp_adjust_tail, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset) { void *meta = xdp->data_meta + offset; @@ -3074,7 +3098,8 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_l4_csum_replace || func == bpf_xdp_adjust_head || func == bpf_xdp_adjust_meta || - func == bpf_msg_pull_data) + func == bpf_msg_pull_data || + func == bpf_xdp_adjust_tail) return true; return false; @@ -3888,6 +3913,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_xdp_redirect_proto; case BPF_FUNC_redirect_map: return &bpf_xdp_redirect_map_proto; + case BPF_FUNC_xdp_adjust_tail: + return &bpf_xdp_adjust_tail_proto; default: return bpf_base_func_proto(func_id); } -- cgit v1.2.3 From b6240a4df0186c03e5ffff6f61570ed31a1a5172 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Mon, 26 Mar 2018 17:27:41 +0800 Subject: scsi: libsas: add transport class for ATA devices ATA devices attached via a SAS controller currently have no transport class, so no information about these devices appears in /sys/class/ata_port (or ata_link or ata_device). Add a transport class for the ATA devices attached to a SAS controller. The /sys/class directory will then show the information for these ATA devices as follows:

localhost:/sys/class # ls ata*
ata_device:
dev1.0  dev2.0

ata_link:
link1  link2

ata_port:
ata1  ata2

There is no functional change to the device scanning and I/O path. The ATA transport class is deleted when the SAS devices are destroyed. Signed-off-by: Jason Yan CC: Dan Williams CC: Tejun Heo Acked-by: Tejun Heo Signed-off-by: Martin K. 
Petersen --- drivers/ata/libata-scsi.c | 12 ++++++++++++ drivers/scsi/libsas/sas_ata.c | 5 +++++ drivers/scsi/libsas/sas_discover.c | 1 + include/linux/libata.h | 2 ++ 4 files changed, 20 insertions(+) (limited to 'include') diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 89a9d4a2efc8..1c9f80fbc51c 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -5051,6 +5051,18 @@ int ata_sas_port_init(struct ata_port *ap) } EXPORT_SYMBOL_GPL(ata_sas_port_init); +int ata_sas_tport_add(struct device *parent, struct ata_port *ap) +{ + return ata_tport_add(parent, ap); +} +EXPORT_SYMBOL_GPL(ata_sas_tport_add); + +void ata_sas_tport_delete(struct ata_port *ap) +{ + ata_tport_delete(ap); +} +EXPORT_SYMBOL_GPL(ata_sas_tport_delete); + /** * ata_sas_port_destroy - Destroy a SATA port allocated by ata_sas_port_alloc * @ap: SATA port to destroy diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c index 0cc1567eacc1..ff1d612f6fb9 100644 --- a/drivers/scsi/libsas/sas_ata.c +++ b/drivers/scsi/libsas/sas_ata.c @@ -577,6 +577,11 @@ int sas_ata_init(struct domain_device *found_dev) ata_sas_port_destroy(ap); return rc; } + rc = ata_sas_tport_add(found_dev->sata_dev.ata_host.dev, ap); + if (rc) { + ata_sas_port_destroy(ap); + return rc; + } found_dev->sata_dev.ap = ap; return 0; diff --git a/drivers/scsi/libsas/sas_discover.c b/drivers/scsi/libsas/sas_discover.c index a0fa7ef3a071..1ffca28fe6a8 100644 --- a/drivers/scsi/libsas/sas_discover.c +++ b/drivers/scsi/libsas/sas_discover.c @@ -314,6 +314,7 @@ void sas_free_device(struct kref *kref) kfree(dev->ex_dev.ex_phy); if (dev_is_sata(dev) && dev->sata_dev.ap) { + ata_sas_tport_delete(dev->sata_dev.ap); ata_sas_port_destroy(dev->sata_dev.ap); dev->sata_dev.ap = NULL; } diff --git a/include/linux/libata.h b/include/linux/libata.h index 1795fecdea17..0619ebf4d475 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1130,6 +1130,8 @@ extern void ata_sas_async_probe(struct ata_port *ap); extern int ata_sas_sync_probe(struct ata_port *ap); extern int ata_sas_port_init(struct ata_port *); extern int ata_sas_port_start(struct ata_port *ap); +extern int ata_sas_tport_add(struct device *parent, struct ata_port *ap); +extern void ata_sas_tport_delete(struct ata_port *ap); extern void ata_sas_port_stop(struct ata_port *ap); extern int ata_sas_slave_configure(struct scsi_device *, struct ata_port *); extern int ata_sas_queuecmd(struct scsi_cmnd *cmd, struct ata_port *ap); -- cgit v1.2.3 From 63273cb40101b6f303a5493f1bdf629d4ab3746b Mon Sep 17 00:00:00 2001 From: Long Li Date: Tue, 27 Mar 2018 17:48:38 -0700 Subject: scsi: vmbus: Add function to report available ring buffer to write in total ring size percentage Netvsc has a function to calculate how much ring buffer in percentage is available to write. This function is also useful for storvsc and other vmbus devices. Define a similar function in vmbus to be used by other vmbus devices. Signed-off-by: Long Li Acked-by: David S. Miller Signed-off-by: Martin K. 
Petersen --- drivers/hv/ring_buffer.c | 2 ++ include/linux/hyperv.h | 12 ++++++++++++ 2 files changed, 14 insertions(+) (limited to 'include') diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c index 8699bb969e7e..3c836c099a8f 100644 --- a/drivers/hv/ring_buffer.c +++ b/drivers/hv/ring_buffer.c @@ -227,6 +227,8 @@ int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info, ring_info->ring_buffer->feature_bits.value = 1; ring_info->ring_size = page_cnt << PAGE_SHIFT; + ring_info->ring_size_div10_reciprocal = + reciprocal_value(ring_info->ring_size / 10); ring_info->ring_datasize = ring_info->ring_size - sizeof(struct hv_ring_buffer); diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 192ed8fbc403..9ac954ee577e 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -35,6 +35,7 @@ #include #include #include +#include #define MAX_PAGE_BUFFER_COUNT 32 #define MAX_MULTIPAGE_BUFFER_COUNT 32 /* 128K */ @@ -120,6 +121,7 @@ struct hv_ring_buffer { struct hv_ring_buffer_info { struct hv_ring_buffer *ring_buffer; u32 ring_size; /* Include the shared header */ + struct reciprocal_value ring_size_div10_reciprocal; spinlock_t ring_lock; u32 ring_datasize; /* < ring_size */ @@ -154,6 +156,16 @@ static inline u32 hv_get_bytes_to_write(const struct hv_ring_buffer_info *rbi) return write; } +static inline u32 hv_get_avail_to_write_percent( + const struct hv_ring_buffer_info *rbi) +{ + u32 avail_write = hv_get_bytes_to_write(rbi); + + return reciprocal_divide( + (avail_write << 3) + (avail_write << 1), + rbi->ring_size_div10_reciprocal); +} + /* * VMBUS version is 32 bit entity broken up into * two 16 bit quantities: major_number. minor_number. -- cgit v1.2.3 From a2dd6877b43ef14129f258910d60b2e81b32100b Mon Sep 17 00:00:00 2001 From: Murali Karicheri Date: Tue, 17 Apr 2018 17:30:31 -0400 Subject: soc: ti: K2G: provide APIs to support driver probe deferral This patch provide APIs to allow client drivers to support probe deferral. On K2G SoC, devices can be probed only after the ti_sci_pm_domains driver is probed and ready. As drivers may get probed at different order, any driver that depends on knav dma and qmss drivers, for example netcp network driver, needs to defer probe until knav devices are probed and ready to service. To do this, add an API to query the device ready status from the knav dma and qmss devices. Signed-off-by: Murali Karicheri Signed-off-by: David S. 
Miller --- drivers/soc/ti/knav_dma.c | 8 ++++++++ drivers/soc/ti/knav_qmss_queue.c | 8 ++++++++ include/linux/soc/ti/knav_dma.h | 12 ++++++++++++ include/linux/soc/ti/knav_qmss.h | 1 + 4 files changed, 29 insertions(+) (limited to 'include') diff --git a/drivers/soc/ti/knav_dma.c b/drivers/soc/ti/knav_dma.c index 026182d3b27c..224d7ddeeb76 100644 --- a/drivers/soc/ti/knav_dma.c +++ b/drivers/soc/ti/knav_dma.c @@ -134,6 +134,13 @@ struct knav_dma_chan { static struct knav_dma_pool_device *kdev; +static bool device_ready; +bool knav_dma_device_ready(void) +{ + return device_ready; +} +EXPORT_SYMBOL_GPL(knav_dma_device_ready); + static bool check_config(struct knav_dma_chan *chan, struct knav_dma_cfg *cfg) { if (!memcmp(&chan->cfg, cfg, sizeof(*cfg))) @@ -773,6 +780,7 @@ static int knav_dma_probe(struct platform_device *pdev) debugfs_create_file("knav_dma", S_IFREG | S_IRUGO, NULL, NULL, &knav_dma_debug_ops); + device_ready = true; return ret; } diff --git a/drivers/soc/ti/knav_qmss_queue.c b/drivers/soc/ti/knav_qmss_queue.c index 8526c8ed3af2..419365a8d1c2 100644 --- a/drivers/soc/ti/knav_qmss_queue.c +++ b/drivers/soc/ti/knav_qmss_queue.c @@ -74,6 +74,13 @@ static DEFINE_MUTEX(knav_dev_lock); */ const char *knav_acc_firmwares[] = {"ks2_qmss_pdsp_acc48.bin"}; +static bool device_ready; +bool knav_qmss_device_ready(void) +{ + return device_ready; +} +EXPORT_SYMBOL_GPL(knav_qmss_device_ready); + /** * knav_queue_notify: qmss queue notfier call * @@ -1849,6 +1856,7 @@ static int knav_queue_probe(struct platform_device *pdev) debugfs_create_file("qmss", S_IFREG | S_IRUGO, NULL, NULL, &knav_queue_debug_ops); + device_ready = true; return 0; err: diff --git a/include/linux/soc/ti/knav_dma.h b/include/linux/soc/ti/knav_dma.h index 66693bc4c6ad..7127ec301537 100644 --- a/include/linux/soc/ti/knav_dma.h +++ b/include/linux/soc/ti/knav_dma.h @@ -167,6 +167,8 @@ struct knav_dma_desc { void *knav_dma_open_channel(struct device *dev, const char *name, struct knav_dma_cfg *config); void knav_dma_close_channel(void *channel); +int knav_dma_get_flow(void *channel); +bool knav_dma_device_ready(void); #else static inline void *knav_dma_open_channel(struct device *dev, const char *name, struct knav_dma_cfg *config) @@ -176,6 +178,16 @@ static inline void *knav_dma_open_channel(struct device *dev, const char *name, static inline void knav_dma_close_channel(void *channel) {} +static inline int knav_dma_get_flow(void *channel) +{ + return -EINVAL; +} + +static inline bool knav_dma_device_ready(void) +{ + return false; +} + #endif #endif /* __SOC_TI_KEYSTONE_NAVIGATOR_DMA_H__ */ diff --git a/include/linux/soc/ti/knav_qmss.h b/include/linux/soc/ti/knav_qmss.h index 9f0ebb3bad27..9745df6ed9d3 100644 --- a/include/linux/soc/ti/knav_qmss.h +++ b/include/linux/soc/ti/knav_qmss.h @@ -86,5 +86,6 @@ int knav_pool_desc_map(void *ph, void *desc, unsigned size, void *knav_pool_desc_unmap(void *ph, dma_addr_t dma, unsigned dma_sz); dma_addr_t knav_pool_desc_virt_to_dma(void *ph, void *virt); void *knav_pool_desc_dma_to_virt(void *ph, dma_addr_t dma); +bool knav_qmss_device_ready(void); #endif /* __SOC_TI_KNAV_QMSS_H__ */ -- cgit v1.2.3 From ce20cdf498bdfa6c6130be75cf4359dad305ebcd Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Mon, 9 Apr 2018 16:40:34 +0200 Subject: netfilter: xt_NFLOG: use nf_log_packet instead of nfulnl_log_packet. The nfulnl_log_packet() is added to make sure that the NFLOG target works as only user-space logger. 
but now, nf_log_packet() can find proper log function using NF_LOG_TYPE_ULOG and NF_LOG_TYPE_LOG. Signed-off-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nfnetlink_log.h | 17 ----------------- net/netfilter/nfnetlink_log.c | 8 +++----- net/netfilter/xt_NFLOG.c | 15 +++++++++++---- 3 files changed, 14 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nfnetlink_log.h b/include/net/netfilter/nfnetlink_log.h index 612cfb63ac68..ea32a7d3cf1b 100644 --- a/include/net/netfilter/nfnetlink_log.h +++ b/include/net/netfilter/nfnetlink_log.h @@ -1,18 +1 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _KER_NFNETLINK_LOG_H -#define _KER_NFNETLINK_LOG_H - -void -nfulnl_log_packet(struct net *net, - u_int8_t pf, - unsigned int hooknum, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *li_user, - const char *prefix); - -#define NFULNL_COPY_DISABLED 0xff - -#endif /* _KER_NFNETLINK_LOG_H */ - diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 7b46aa4c478d..e5cc4d9b9ce7 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include @@ -47,6 +46,7 @@ #include "../bridge/br_private.h" #endif +#define NFULNL_COPY_DISABLED 0xff #define NFULNL_NLBUFSIZ_DEFAULT NLMSG_GOODSIZE #define NFULNL_TIMEOUT_DEFAULT 100 /* every second */ #define NFULNL_QTHRESH_DEFAULT 100 /* 100 packets */ @@ -618,7 +618,7 @@ static const struct nf_loginfo default_loginfo = { }; /* log handler for internal netfilter logging api */ -void +static void nfulnl_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum, @@ -633,7 +633,7 @@ nfulnl_log_packet(struct net *net, struct nfulnl_instance *inst; const struct nf_loginfo *li; unsigned int qthreshold; - unsigned int plen; + unsigned int plen = 0; struct nfnl_log_net *log = nfnl_log_pernet(net); const struct nfnl_ct_hook *nfnl_ct = NULL; struct nf_conn *ct = NULL; @@ -648,7 +648,6 @@ nfulnl_log_packet(struct net *net, if (!inst) return; - plen = 0; if (prefix) plen = strlen(prefix) + 1; @@ -760,7 +759,6 @@ alloc_failure: /* FIXME: statistics */ goto unlock_and_release; } -EXPORT_SYMBOL_GPL(nfulnl_log_packet); static int nfulnl_rcv_nl_event(struct notifier_block *this, diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c index c7f8958cea4a..1ed0cac585c4 100644 --- a/net/netfilter/xt_NFLOG.c +++ b/net/netfilter/xt_NFLOG.c @@ -13,7 +13,6 @@ #include #include #include -#include MODULE_AUTHOR("Patrick McHardy "); MODULE_DESCRIPTION("Xtables: packet logging to netlink using NFLOG"); @@ -37,8 +36,9 @@ nflog_tg(struct sk_buff *skb, const struct xt_action_param *par) if (info->flags & XT_NFLOG_F_COPY_LEN) li.u.ulog.flags |= NF_LOG_F_COPY_LEN; - nfulnl_log_packet(net, xt_family(par), xt_hooknum(par), skb, - xt_in(par), xt_out(par), &li, info->prefix); + nf_log_packet(net, xt_family(par), xt_hooknum(par), skb, xt_in(par), + xt_out(par), &li, "%s", info->prefix); + return XT_CONTINUE; } @@ -50,7 +50,13 @@ static int nflog_tg_check(const struct xt_tgchk_param *par) return -EINVAL; if (info->prefix[sizeof(info->prefix) - 1] != '\0') return -EINVAL; - return 0; + + return nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG); +} + +static void nflog_tg_destroy(const struct xt_tgdtor_param *par) +{ + nf_logger_put(par->family, NF_LOG_TYPE_ULOG); } static struct xt_target nflog_tg_reg __read_mostly = { @@ -58,6 +64,7 @@ static struct 
xt_target nflog_tg_reg __read_mostly = { .revision = 0, .family = NFPROTO_UNSPEC, .checkentry = nflog_tg_check, + .destroy = nflog_tg_destroy, .target = nflog_tg, .targetsize = sizeof(struct xt_nflog_info), .me = THIS_MODULE, -- cgit v1.2.3 From 291bfb928863d496e25c785e132a8fbfb32341a8 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 19 Apr 2018 12:14:10 +0100 Subject: ASoC: topology: Revert recent changes while boot errors are investigated Krzysztof Kozlowski reported a NULL dereference in _instantiate_card() on Odroid XU3 and XU boards which he bisected to 45f8cb57da0d7 (ASoC: core: Allow topology to override machine driver FE DAI link config). Revert that commit for now, along with f11a5c27f928 (ASoC: core: Add name prefix for machines with topology rewrites) due to dependency issues, in order to keep things booting cleanly in -next. Reported-by: Krzysztof Kozlowski Signed-off-by: Mark Brown --- include/sound/soc.h | 12 -------- sound/soc/soc-core.c | 87 ++-------------------------------------------------- sound/soc/soc-pcm.c | 12 -------- 3 files changed, 3 insertions(+), 108 deletions(-) (limited to 'include') diff --git a/include/sound/soc.h b/include/sound/soc.h index 3676d0a8f532..ad266d7e9553 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -1012,14 +1012,6 @@ struct snd_soc_platform_driver { /* platform stream compress ops */ const struct snd_compr_ops *compr_ops; - - /* this platform uses topology and ignore machine driver FEs */ - const char *ignore_machine; - const char *topology_name_prefix; - int (*be_hw_params_fixup)(struct snd_soc_pcm_runtime *rtd, - struct snd_pcm_hw_params *params); - bool use_dai_pcm_id; /* use the DAI link PCM ID as PCM device number */ - int be_pcm_base; /* base device ID for all BE PCMs */ }; struct snd_soc_dai_link_component { @@ -1126,9 +1118,6 @@ struct snd_soc_dai_link { /* pmdown_time is ignored at stop */ unsigned int ignore_pmdown_time:1; - /* Do not create a PCM for this DAI link (Backend link) */ - unsigned int ignore:1; - struct list_head list; /* DAI link list of the soc card */ struct snd_soc_dobj dobj; /* For topology */ }; @@ -1168,7 +1157,6 @@ struct snd_soc_card { const char *long_name; const char *driver_name; char dmi_longname[80]; - char topology_shortname[32]; struct device *dev; struct snd_card *snd_card; diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index aa1c33b3cce0..bf7ca32ab31f 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -1050,9 +1050,6 @@ static int soc_bind_dai_link(struct snd_soc_card *card, const char *platform_name; int i; - if (dai_link->ignore) - return 0; - dev_dbg(card->dev, "ASoC: binding %s\n", dai_link->name); if (soc_is_dai_link_bound(card, dai_link)) { @@ -1675,7 +1672,7 @@ static int soc_probe_link_dais(struct snd_soc_card *card, { struct snd_soc_dai_link *dai_link = rtd->dai_link; struct snd_soc_dai *cpu_dai = rtd->cpu_dai; - int i, ret, num; + int i, ret; dev_dbg(card->dev, "ASoC: probe %s dai link %d late %d\n", card->name, rtd->num, order); @@ -1721,23 +1718,9 @@ static int soc_probe_link_dais(struct snd_soc_card *card, soc_dpcm_debugfs_add(rtd); #endif - /* - * most drivers will register their PCMs using DAI link ordering but - * topology based drivers can use the DAI link id field to set PCM - * device number and then use rtd + a base offset of the BEs. 
- */ - if (rtd->platform->driver->use_dai_pcm_id) { - if (rtd->dai_link->no_pcm) - num = rtd->platform->driver->be_pcm_base + rtd->num; - else - num = rtd->dai_link->id; - } else { - num = rtd->num; - } - if (cpu_dai->driver->compress_new) { /*create compress_device"*/ - ret = cpu_dai->driver->compress_new(rtd, num); + ret = cpu_dai->driver->compress_new(rtd, rtd->num); if (ret < 0) { dev_err(card->dev, "ASoC: can't create compress %s\n", dai_link->stream_name); @@ -1747,7 +1730,7 @@ static int soc_probe_link_dais(struct snd_soc_card *card, if (!dai_link->params) { /* create the pcm */ - ret = soc_new_pcm(rtd, num); + ret = soc_new_pcm(rtd, rtd->num); if (ret < 0) { dev_err(card->dev, "ASoC: can't create pcm %s :%d\n", dai_link->stream_name, ret); @@ -2093,67 +2076,6 @@ int snd_soc_set_dmi_name(struct snd_soc_card *card, const char *flavour) EXPORT_SYMBOL_GPL(snd_soc_set_dmi_name); #endif /* CONFIG_DMI */ -static void soc_check_tplg_fes(struct snd_soc_card *card) -{ - struct snd_soc_platform *platform; - struct snd_soc_dai_link *dai_link; - int i; - - list_for_each_entry(platform, &platform_list, list) { - - /* does this platform override FEs ? */ - if (!platform->driver->ignore_machine) - continue; - - /* for this machine ? */ - if (strcmp(platform->driver->ignore_machine, - card->dev->driver->name)) - continue; - - /* machine matches, so override the rtd data */ - for (i = 0; i < card->num_links; i++) { - - dai_link = &card->dai_link[i]; - - /* ignore this FE */ - if (dai_link->dynamic) { - dai_link->ignore = true; - continue; - } - - dev_info(card->dev, "info: override FE DAI link %s\n", - card->dai_link[i].name); - - /* override platform */ - dai_link->platform_name = platform->component.name; - dai_link->cpu_dai_name = platform->component.name; - - /* convert non BE into BE */ - dai_link->no_pcm = 1; - dai_link->dpcm_playback = 1; - dai_link->dpcm_capture = 1; - - /* override any BE fixups */ - dai_link->be_hw_params_fixup = - platform->driver->be_hw_params_fixup; - - /* most BE links dont set stream name, so set it to - * dai link name if it's NULL to help bind widgets. 
- */ - if (!dai_link->stream_name) - dai_link->stream_name = dai_link->name; - } - - /* Inform userspace we are using alternate topology */ - if (platform->driver->topology_name_prefix) { - snprintf(card->topology_shortname, 32, "%s-%s", - platform->driver->topology_name_prefix, - card->name); - card->name = card->topology_shortname; - } - } -} - static int snd_soc_instantiate_card(struct snd_soc_card *card) { struct snd_soc_codec *codec; @@ -2164,9 +2086,6 @@ static int snd_soc_instantiate_card(struct snd_soc_card *card) mutex_lock(&client_mutex); mutex_lock_nested(&card->mutex, SND_SOC_CARD_CLASS_INIT); - /* check whether any platform is ignore machine FE and using topology */ - soc_check_tplg_fes(card); - /* bind DAIs */ for (i = 0; i < card->num_links; i++) { ret = soc_bind_dai_link(card, &card->dai_link[i]); diff --git a/sound/soc/soc-pcm.c b/sound/soc/soc-pcm.c index 4ce489165a6d..68d9dc930096 100644 --- a/sound/soc/soc-pcm.c +++ b/sound/soc/soc-pcm.c @@ -909,20 +909,8 @@ int soc_dai_hw_params(struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params, struct snd_soc_dai *dai) { - struct snd_soc_pcm_runtime *rtd = substream->private_data; int ret; - /* perform any topology hw_params fixups before DAI */ - if (rtd->dai_link->be_hw_params_fixup) { - ret = rtd->dai_link->be_hw_params_fixup(rtd, params); - if (ret < 0) { - dev_err(rtd->dev, - "ASoC: hw_params topology fixup failed %d\n", - ret); - return ret; - } - } - if (dai->driver->ops->hw_params) { ret = dai->driver->ops->hw_params(substream, params, dai); if (ret < 0) { -- cgit v1.2.3 From 5055c67760df8e13c68f73b8534d5792d1d36543 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Tue, 13 Mar 2018 21:03:24 -0700 Subject: compat: Make compat helpers independent of CONFIG_COMPAT Many of the compat time syscalls are also repurposed as 32 bit native syscalls to provide backward compatibility while adding new y2038 safe syscalls. Enabling the helpers makes this possible. Signed-off-by: Arnd Bergmann Signed-off-by: Deepa Dinamani Signed-off-by: Arnd Bergmann --- include/linux/compat.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/compat.h b/include/linux/compat.h index 081281ad5772..3f35f4cfb98d 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -8,8 +8,6 @@ #include -#ifdef CONFIG_COMPAT - #include #include /* for HZ */ #include @@ -20,9 +18,11 @@ #include #include +#ifdef CONFIG_COMPAT #include #include #include +#endif #ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER /* @@ -83,6 +83,8 @@ static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) #endif /* COMPAT_SYSCALL_DEFINEx */ +#ifdef CONFIG_COMPAT + #ifndef compat_user_stack_pointer #define compat_user_stack_pointer() current_user_stack_pointer() #endif @@ -1016,7 +1018,9 @@ static inline struct compat_timeval ns_to_compat_timeval(s64 nsec) #else /* !CONFIG_COMPAT */ #define is_compat_task() (0) +#ifndef in_compat_syscall static inline bool in_compat_syscall(void) { return false; } +#endif #endif /* CONFIG_COMPAT */ -- cgit v1.2.3 From 2b5a9a37e97467fea67070a94529d6d07e549270 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 26 Mar 2018 16:59:15 +0200 Subject: time: Add an asm-generic/compat.h file We have a couple of files that try to include asm/compat.h on architectures where this is available.
Those should generally use the higher-level linux/compat.h file, but that in turn fails to include asm/compat.h when CONFIG_COMPAT is disabled, unless we can provide that header on all architectures. This adds the asm/compat.h for all remaining architectures to simplify the dependencies. Architectures that are getting removed in linux-4.17 are not changed here, to avoid needless conflicts with the removal patches. Those architectures are broken by this patch, but we have already shown that they have no users. Signed-off-by: Arnd Bergmann --- arch/alpha/include/asm/Kbuild | 1 + arch/arc/include/asm/Kbuild | 1 + arch/arm/include/asm/Kbuild | 1 + arch/c6x/include/asm/Kbuild | 1 + arch/h8300/include/asm/Kbuild | 1 + arch/hexagon/include/asm/Kbuild | 1 + arch/ia64/include/asm/Kbuild | 1 + arch/m68k/include/asm/Kbuild | 1 + arch/microblaze/include/asm/Kbuild | 1 + arch/nds32/include/asm/Kbuild | 1 + arch/nios2/include/asm/Kbuild | 1 + arch/openrisc/include/asm/Kbuild | 1 + arch/sh/include/asm/Kbuild | 1 + arch/um/include/asm/Kbuild | 1 + arch/unicore32/include/asm/Kbuild | 1 + arch/xtensa/include/asm/Kbuild | 1 + include/asm-generic/compat.h | 3 +++ include/linux/compat.h | 3 ++- 18 files changed, 21 insertions(+), 1 deletion(-) create mode 100644 include/asm-generic/compat.h (limited to 'include') diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild index 9b68790013e2..0580cb8c84b2 100644 --- a/arch/alpha/include/asm/Kbuild +++ b/arch/alpha/include/asm/Kbuild @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 +generic-y += compat.h generic-y += exec.h generic-y += export.h generic-y += fb.h diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild index 4bd5d4369e05..d51bc22e8795 100644 --- a/arch/arc/include/asm/Kbuild +++ b/arch/arc/include/asm/Kbuild @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 generic-y += bugs.h +generic-y += compat.h generic-y += device.h generic-y += div64.h generic-y += emergency-restart.h diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild index 873e3c189279..1d66db9c9db5 100644 --- a/arch/arm/include/asm/Kbuild +++ b/arch/arm/include/asm/Kbuild @@ -1,3 +1,4 @@ +generic-y += compat.h generic-y += current.h generic-y += early_ioremap.h generic-y += emergency-restart.h diff --git a/arch/c6x/include/asm/Kbuild b/arch/c6x/include/asm/Kbuild index fd4c840de837..24037486317c 100644 --- a/arch/c6x/include/asm/Kbuild +++ b/arch/c6x/include/asm/Kbuild @@ -1,6 +1,7 @@ generic-y += atomic.h generic-y += barrier.h generic-y += bugs.h +generic-y += compat.h generic-y += current.h generic-y += device.h generic-y += div64.h diff --git a/arch/h8300/include/asm/Kbuild b/arch/h8300/include/asm/Kbuild index 14bac06b7116..a5d0b2991f47 100644 --- a/arch/h8300/include/asm/Kbuild +++ b/arch/h8300/include/asm/Kbuild @@ -3,6 +3,7 @@ generic-y += barrier.h generic-y += bugs.h generic-y += cacheflush.h generic-y += checksum.h +generic-y += compat.h generic-y += current.h generic-y += delay.h generic-y += device.h diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild index e9743f689fb8..dd2fd9c0d292 100644 --- a/arch/hexagon/include/asm/Kbuild +++ b/arch/hexagon/include/asm/Kbuild @@ -2,6 +2,7 @@ generic-y += barrier.h generic-y += bug.h generic-y += bugs.h +generic-y += compat.h generic-y += current.h generic-y += device.h generic-y += div64.h diff --git a/arch/ia64/include/asm/Kbuild b/arch/ia64/include/asm/Kbuild index 6dd867873364..557bbc8ba9f5 100644 --- a/arch/ia64/include/asm/Kbuild +++ 
b/arch/ia64/include/asm/Kbuild @@ -1,3 +1,4 @@ +generic-y += compat.h generic-y += exec.h generic-y += irq_work.h generic-y += mcs_spinlock.h diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild index 88a9d27df1ac..4d8d68c4e3dd 100644 --- a/arch/m68k/include/asm/Kbuild +++ b/arch/m68k/include/asm/Kbuild @@ -1,4 +1,5 @@ generic-y += barrier.h +generic-y += compat.h generic-y += device.h generic-y += emergency-restart.h generic-y += exec.h diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild index 3c80a5a308ed..fe6a6c6e5003 100644 --- a/arch/microblaze/include/asm/Kbuild +++ b/arch/microblaze/include/asm/Kbuild @@ -2,6 +2,7 @@ generic-y += barrier.h generic-y += bitops.h generic-y += bug.h generic-y += bugs.h +generic-y += compat.h generic-y += device.h generic-y += div64.h generic-y += emergency-restart.h diff --git a/arch/nds32/include/asm/Kbuild b/arch/nds32/include/asm/Kbuild index 06bdf8167f5a..d1b1f89d44f6 100644 --- a/arch/nds32/include/asm/Kbuild +++ b/arch/nds32/include/asm/Kbuild @@ -9,6 +9,7 @@ generic-y += checksum.h generic-y += clkdev.h generic-y += cmpxchg.h generic-y += cmpxchg-local.h +generic-y += compat.h generic-y += cputime.h generic-y += device.h generic-y += div64.h diff --git a/arch/nios2/include/asm/Kbuild b/arch/nios2/include/asm/Kbuild index d232da2cbb38..64ed3d656956 100644 --- a/arch/nios2/include/asm/Kbuild +++ b/arch/nios2/include/asm/Kbuild @@ -4,6 +4,7 @@ generic-y += bitops.h generic-y += bug.h generic-y += bugs.h generic-y += cmpxchg.h +generic-y += compat.h generic-y += current.h generic-y += device.h generic-y += div64.h diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild index f05c722a21f8..65964d390b10 100644 --- a/arch/openrisc/include/asm/Kbuild +++ b/arch/openrisc/include/asm/Kbuild @@ -2,6 +2,7 @@ generic-y += barrier.h generic-y += bug.h generic-y += bugs.h generic-y += checksum.h +generic-y += compat.h generic-y += current.h generic-y += device.h generic-y += div64.h diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild index 1efcce74997b..46dd82ab2c29 100644 --- a/arch/sh/include/asm/Kbuild +++ b/arch/sh/include/asm/Kbuild @@ -1,3 +1,4 @@ +generic-y += compat.h generic-y += current.h generic-y += delay.h generic-y += div64.h diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild index bb5a196c3061..b10dde6cb793 100644 --- a/arch/um/include/asm/Kbuild +++ b/arch/um/include/asm/Kbuild @@ -1,6 +1,7 @@ generic-y += barrier.h generic-y += bpf_perf_event.h generic-y += bug.h +generic-y += compat.h generic-y += current.h generic-y += delay.h generic-y += device.h diff --git a/arch/unicore32/include/asm/Kbuild b/arch/unicore32/include/asm/Kbuild index 6f70c76c81fc..bfc7abe77905 100644 --- a/arch/unicore32/include/asm/Kbuild +++ b/arch/unicore32/include/asm/Kbuild @@ -1,5 +1,6 @@ generic-y += atomic.h generic-y += bugs.h +generic-y += compat.h generic-y += current.h generic-y += device.h generic-y += div64.h diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild index 436b20337168..e5e1e61c538c 100644 --- a/arch/xtensa/include/asm/Kbuild +++ b/arch/xtensa/include/asm/Kbuild @@ -1,4 +1,5 @@ generic-y += bug.h +generic-y += compat.h generic-y += device.h generic-y += div64.h generic-y += dma-contiguous.h diff --git a/include/asm-generic/compat.h b/include/asm-generic/compat.h new file mode 100644 index 000000000000..28819451b6d1 --- /dev/null +++ b/include/asm-generic/compat.h @@ -0,0 +1,3 @@ +/* SPDX-License-Identifier: 
GPL-2.0 */ + +/* This is an empty stub for 32-bit-only architectures */ diff --git a/include/linux/compat.h b/include/linux/compat.h index 3f35f4cfb98d..48d29b1339be 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -18,8 +18,9 @@ #include #include -#ifdef CONFIG_COMPAT #include + +#ifdef CONFIG_COMPAT #include #include #endif -- cgit v1.2.3 From 0d55303c51a4f35f674617e415632d492b596c26 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Tue, 13 Mar 2018 21:03:25 -0700 Subject: compat: Move compat_timespec/ timeval to compat_time.h All the current architecture specific defines for these are the same. Refactor these common defines to a common header file. The new common linux/compat_time.h is also useful as it will eventually be used to hold all the defines that are needed for compat time types that support non y2038 safe types. New architectures need not have to define these new types as they will only use new y2038 safe syscalls. This file can be deleted after y2038 when we stop supporting non y2038 safe syscalls. The patch also requires an operation similar to: git grep "asm/compat\.h" | cut -d ":" -f 1 | xargs -n 1 sed -i -e "s%asm/compat.h%linux/compat.h%g" Cc: acme@kernel.org Cc: benh@kernel.crashing.org Cc: borntraeger@de.ibm.com Cc: catalin.marinas@arm.com Cc: cmetcalf@mellanox.com Cc: cohuck@redhat.com Cc: davem@davemloft.net Cc: deller@gmx.de Cc: devel@driverdev.osuosl.org Cc: gerald.schaefer@de.ibm.com Cc: gregkh@linuxfoundation.org Cc: heiko.carstens@de.ibm.com Cc: hoeppner@linux.vnet.ibm.com Cc: hpa@zytor.com Cc: jejb@parisc-linux.org Cc: jwi@linux.vnet.ibm.com Cc: linux-kernel@vger.kernel.org Cc: linux-mips@linux-mips.org Cc: linux-parisc@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-s390@vger.kernel.org Cc: mark.rutland@arm.com Cc: mingo@redhat.com Cc: mpe@ellerman.id.au Cc: oberpar@linux.vnet.ibm.com Cc: oprofile-list@lists.sf.net Cc: paulus@samba.org Cc: peterz@infradead.org Cc: ralf@linux-mips.org Cc: rostedt@goodmis.org Cc: rric@kernel.org Cc: schwidefsky@de.ibm.com Cc: sebott@linux.vnet.ibm.com Cc: sparclinux@vger.kernel.org Cc: sth@linux.vnet.ibm.com Cc: ubraun@linux.vnet.ibm.com Cc: will.deacon@arm.com Cc: x86@kernel.org Signed-off-by: Arnd Bergmann Signed-off-by: Deepa Dinamani Acked-by: Steven Rostedt (VMware) Acked-by: Catalin Marinas Acked-by: James Hogan Acked-by: Helge Deller Signed-off-by: Arnd Bergmann --- arch/arm64/include/asm/compat.h | 11 ----------- arch/arm64/include/asm/stat.h | 1 + arch/arm64/kernel/hw_breakpoint.c | 1 - arch/arm64/kernel/perf_regs.c | 2 +- arch/mips/include/asm/compat.h | 11 ----------- arch/mips/kernel/signal32.c | 2 +- arch/parisc/include/asm/compat.h | 11 ----------- arch/powerpc/include/asm/compat.h | 11 ----------- arch/powerpc/kernel/asm-offsets.c | 2 +- arch/powerpc/oprofile/backtrace.c | 1 + arch/s390/hypfs/hypfs_sprp.c | 1 - arch/s390/include/asm/compat.h | 11 ----------- arch/s390/include/asm/elf.h | 4 ++-- arch/s390/kvm/priv.c | 1 - arch/s390/pci/pci_clp.c | 1 - arch/sparc/include/asm/compat.h | 11 ----------- arch/x86/events/core.c | 2 +- arch/x86/include/asm/compat.h | 11 ----------- arch/x86/include/asm/ftrace.h | 2 +- arch/x86/kernel/sys_x86_64.c | 2 +- drivers/s390/block/dasd_ioctl.c | 1 - drivers/s390/char/fs3270.c | 1 - drivers/s390/char/sclp_ctl.c | 1 - drivers/s390/char/vmcp.c | 1 - drivers/s390/cio/chsc_sch.c | 1 - drivers/s390/net/qeth_core_main.c | 2 +- include/linux/compat.h | 1 + include/linux/compat_time.h | 19 +++++++++++++++++++ 28 files changed, 31 insertions(+), 95 
deletions(-) create mode 100644 include/linux/compat_time.h (limited to 'include') diff --git a/arch/arm64/include/asm/compat.h b/arch/arm64/include/asm/compat.h index c00c62e1a4a3..0030f79808b3 100644 --- a/arch/arm64/include/asm/compat.h +++ b/arch/arm64/include/asm/compat.h @@ -34,7 +34,6 @@ typedef u32 compat_size_t; typedef s32 compat_ssize_t; -typedef s32 compat_time_t; typedef s32 compat_clock_t; typedef s32 compat_pid_t; typedef u16 __compat_uid_t; @@ -66,16 +65,6 @@ typedef u32 compat_ulong_t; typedef u64 compat_u64; typedef u32 compat_uptr_t; -struct compat_timespec { - compat_time_t tv_sec; - s32 tv_nsec; -}; - -struct compat_timeval { - compat_time_t tv_sec; - s32 tv_usec; -}; - struct compat_stat { #ifdef __AARCH64EB__ short st_dev; diff --git a/arch/arm64/include/asm/stat.h b/arch/arm64/include/asm/stat.h index 15e35598ac40..eab738019707 100644 --- a/arch/arm64/include/asm/stat.h +++ b/arch/arm64/include/asm/stat.h @@ -20,6 +20,7 @@ #ifdef CONFIG_COMPAT +#include #include /* diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index 74bb56f656ef..413dbe530da8 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -30,7 +30,6 @@ #include #include -#include #include #include #include diff --git a/arch/arm64/kernel/perf_regs.c b/arch/arm64/kernel/perf_regs.c index 1d091d048d04..0bbac612146e 100644 --- a/arch/arm64/kernel/perf_regs.c +++ b/arch/arm64/kernel/perf_regs.c @@ -1,11 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include #include #include #include #include -#include #include #include diff --git a/arch/mips/include/asm/compat.h b/arch/mips/include/asm/compat.h index 9a0fa66b81ac..3e548ee99a2f 100644 --- a/arch/mips/include/asm/compat.h +++ b/arch/mips/include/asm/compat.h @@ -14,7 +14,6 @@ typedef u32 compat_size_t; typedef s32 compat_ssize_t; -typedef s32 compat_time_t; typedef s32 compat_clock_t; typedef s32 compat_suseconds_t; @@ -46,16 +45,6 @@ typedef u32 compat_ulong_t; typedef u64 compat_u64; typedef u32 compat_uptr_t; -struct compat_timespec { - compat_time_t tv_sec; - s32 tv_nsec; -}; - -struct compat_timeval { - compat_time_t tv_sec; - s32 tv_usec; -}; - struct compat_stat { compat_dev_t st_dev; s32 st_pad1[3]; diff --git a/arch/mips/kernel/signal32.c b/arch/mips/kernel/signal32.c index c4db910a8794..b5d9e1784aff 100644 --- a/arch/mips/kernel/signal32.c +++ b/arch/mips/kernel/signal32.c @@ -8,13 +8,13 @@ * Copyright (C) 1999, 2000 Silicon Graphics, Inc. * Copyright (C) 2016, Imagination Technologies Ltd. 
*/ +#include #include #include #include #include #include -#include #include #include #include diff --git a/arch/parisc/include/asm/compat.h b/arch/parisc/include/asm/compat.h index 57b8b2a2fd4e..0cdfec8857bd 100644 --- a/arch/parisc/include/asm/compat.h +++ b/arch/parisc/include/asm/compat.h @@ -13,7 +13,6 @@ typedef u32 compat_size_t; typedef s32 compat_ssize_t; -typedef s32 compat_time_t; typedef s32 compat_clock_t; typedef s32 compat_pid_t; typedef u32 __compat_uid_t; @@ -40,16 +39,6 @@ typedef u32 compat_ulong_t; typedef u64 compat_u64; typedef u32 compat_uptr_t; -struct compat_timespec { - compat_time_t tv_sec; - s32 tv_nsec; -}; - -struct compat_timeval { - compat_time_t tv_sec; - s32 tv_usec; -}; - struct compat_stat { compat_dev_t st_dev; /* dev_t is 32 bits on parisc */ compat_ino_t st_ino; /* 32 bits */ diff --git a/arch/powerpc/include/asm/compat.h b/arch/powerpc/include/asm/compat.h index 62168e1158f1..b4773c81f7d5 100644 --- a/arch/powerpc/include/asm/compat.h +++ b/arch/powerpc/include/asm/compat.h @@ -17,7 +17,6 @@ typedef u32 compat_size_t; typedef s32 compat_ssize_t; -typedef s32 compat_time_t; typedef s32 compat_clock_t; typedef s32 compat_pid_t; typedef u32 __compat_uid_t; @@ -45,16 +44,6 @@ typedef u32 compat_ulong_t; typedef u64 compat_u64; typedef u32 compat_uptr_t; -struct compat_timespec { - compat_time_t tv_sec; - s32 tv_nsec; -}; - -struct compat_timeval { - compat_time_t tv_sec; - s32 tv_usec; -}; - struct compat_stat { compat_dev_t st_dev; compat_ino_t st_ino; diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 6bee65f3cfd3..a77528db474c 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -13,6 +13,7 @@ * 2 of the License, or (at your option) any later version. */ +#include #include #include #include @@ -42,7 +43,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/powerpc/oprofile/backtrace.c b/arch/powerpc/oprofile/backtrace.c index ecc66d5f02c9..ad054dd0d666 100644 --- a/arch/powerpc/oprofile/backtrace.c +++ b/arch/powerpc/oprofile/backtrace.c @@ -7,6 +7,7 @@ * 2 of the License, or (at your option) any later version. **/ +#include #include #include #include diff --git a/arch/s390/hypfs/hypfs_sprp.c b/arch/s390/hypfs/hypfs_sprp.c index ae0ed8dd5f1b..5d85a039391c 100644 --- a/arch/s390/hypfs/hypfs_sprp.c +++ b/arch/s390/hypfs/hypfs_sprp.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include "hypfs.h" diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h index 9830fb6b076e..501aaff85304 100644 --- a/arch/s390/include/asm/compat.h +++ b/arch/s390/include/asm/compat.h @@ -53,7 +53,6 @@ typedef u32 compat_size_t; typedef s32 compat_ssize_t; -typedef s32 compat_time_t; typedef s32 compat_clock_t; typedef s32 compat_pid_t; typedef u16 __compat_uid_t; @@ -97,16 +96,6 @@ typedef struct { u32 gprs_high[NUM_GPRS]; } s390_compat_regs_high; -struct compat_timespec { - compat_time_t tv_sec; - s32 tv_nsec; -}; - -struct compat_timeval { - compat_time_t tv_sec; - s32 tv_usec; -}; - struct compat_stat { compat_dev_t st_dev; u16 __pad1; diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h index 1a61b1b997f2..7d22a474a040 100644 --- a/arch/s390/include/asm/elf.h +++ b/arch/s390/include/asm/elf.h @@ -125,8 +125,9 @@ * ELF register definitions.. 
*/ +#include + #include -#include #include #include @@ -136,7 +137,6 @@ typedef s390_regs elf_gregset_t; typedef s390_fp_regs compat_elf_fpregset_t; typedef s390_compat_regs compat_elf_gregset_t; -#include #include /* for task_struct */ #include diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index ebfa0442e569..a3bce0e84346 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include "gaccess.h" #include "kvm-s390.h" diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index 93cd0f1ca12b..19b2d2a9b43d 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include diff --git a/arch/sparc/include/asm/compat.h b/arch/sparc/include/asm/compat.h index 1898b6223160..1910c44521e3 100644 --- a/arch/sparc/include/asm/compat.h +++ b/arch/sparc/include/asm/compat.h @@ -11,7 +11,6 @@ typedef u32 compat_size_t; typedef s32 compat_ssize_t; -typedef s32 compat_time_t; typedef s32 compat_clock_t; typedef s32 compat_pid_t; typedef u16 __compat_uid_t; @@ -39,16 +38,6 @@ typedef u32 compat_ulong_t; typedef u64 compat_u64; typedef u32 compat_uptr_t; -struct compat_timespec { - compat_time_t tv_sec; - s32 tv_nsec; -}; - -struct compat_timeval { - compat_time_t tv_sec; - s32 tv_usec; -}; - struct compat_stat { compat_dev_t st_dev; compat_ino_t st_ino; diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index a6006e7bb729..7a987e6c7c35 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2391,7 +2391,7 @@ static unsigned long get_segment_base(unsigned int segment) #ifdef CONFIG_IA32_EMULATION -#include +#include static inline int perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry) diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index e1c8dab86670..7cd314b71c51 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -17,7 +17,6 @@ typedef u32 compat_size_t; typedef s32 compat_ssize_t; -typedef s32 compat_time_t; typedef s32 compat_clock_t; typedef s32 compat_pid_t; typedef u16 __compat_uid_t; @@ -46,16 +45,6 @@ typedef u32 compat_u32; typedef u64 __attribute__((aligned(4))) compat_u64; typedef u32 compat_uptr_t; -struct compat_timespec { - compat_time_t tv_sec; - s32 tv_nsec; -}; - -struct compat_timeval { - compat_time_t tv_sec; - s32 tv_usec; -}; - struct compat_stat { compat_dev_t st_dev; u16 __pad1; diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 09ad88572746..db25aa15b705 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -49,7 +49,7 @@ int ftrace_int3_handler(struct pt_regs *regs); #if !defined(__ASSEMBLY__) && !defined(COMPILE_OFFSETS) #if defined(CONFIG_FTRACE_SYSCALLS) && defined(CONFIG_IA32_EMULATION) -#include +#include /* * Because ia32 syscalls do not map to x86_64 syscall numbers diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index a3f15ed545b5..6a78d4b36a79 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include #include #include @@ -19,7 +20,6 @@ #include #include -#include #include #include #include diff --git a/drivers/s390/block/dasd_ioctl.c b/drivers/s390/block/dasd_ioctl.c index 7bdc6aaa0ba3..2016e0ed5865 100644 --- a/drivers/s390/block/dasd_ioctl.c +++ b/drivers/s390/block/dasd_ioctl.c @@ -18,7 +18,6 @@ #include #include #include 
-#include #include #include #include diff --git a/drivers/s390/char/fs3270.c b/drivers/s390/char/fs3270.c index 61822480a2a0..16a4e8528bbc 100644 --- a/drivers/s390/char/fs3270.c +++ b/drivers/s390/char/fs3270.c @@ -19,7 +19,6 @@ #include #include -#include #include #include #include diff --git a/drivers/s390/char/sclp_ctl.c b/drivers/s390/char/sclp_ctl.c index a78cea0c3a09..248b5db3eaa8 100644 --- a/drivers/s390/char/sclp_ctl.c +++ b/drivers/s390/char/sclp_ctl.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include diff --git a/drivers/s390/char/vmcp.c b/drivers/s390/char/vmcp.c index 17e411c57576..948ce82a7725 100644 --- a/drivers/s390/char/vmcp.c +++ b/drivers/s390/char/vmcp.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/s390/cio/chsc_sch.c b/drivers/s390/cio/chsc_sch.c index 0015729d917d..8d9f36625ba5 100644 --- a/drivers/s390/cio/chsc_sch.c +++ b/drivers/s390/cio/chsc_sch.c @@ -16,7 +16,6 @@ #include #include -#include #include #include #include diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index 04fefa5bb08d..df5d8451a51a 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -10,6 +10,7 @@ #define KMSG_COMPONENT "qeth" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#include #include #include #include @@ -32,7 +33,6 @@ #include #include #include -#include #include #include #include diff --git a/include/linux/compat.h b/include/linux/compat.h index 48d29b1339be..cd50b00ef59a 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -7,6 +7,7 @@ */ #include +#include #include #include /* for HZ */ diff --git a/include/linux/compat_time.h b/include/linux/compat_time.h new file mode 100644 index 000000000000..56a54a1e4355 --- /dev/null +++ b/include/linux/compat_time.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_COMPAT_TIME_H +#define _LINUX_COMPAT_TIME_H + +#include + +typedef s32 compat_time_t; + +struct compat_timespec { + compat_time_t tv_sec; + s32 tv_nsec; +}; + +struct compat_timeval { + compat_time_t tv_sec; + s32 tv_usec; +}; + +#endif /* _LINUX_COMPAT_TIME_H */ -- cgit v1.2.3 From 1c68adf61e5845f3525522fa9c797ed370689c85 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Tue, 13 Mar 2018 21:03:26 -0700 Subject: compat: Enable compat_get/put_timespec64 always These functions are used in the repurposed compat syscalls to provide backward compatibility for using 32 bit time_t on 32 bit systems. 
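[Editor's note] A minimal sketch of the call pattern the change above enables, assuming a 32-bit native kernel that repurposes a compat syscall. The syscall name and body here are hypothetical illustrations; only compat_get_timespec64(), struct compat_timespec, and struct timespec64 come from this series.

#include <linux/compat_time.h>
#include <linux/syscalls.h>
#include <linux/time64.h>

/* Hypothetical 32-bit native syscall reusing the always-built helper:
 * compat_get_timespec64() copies a 32-bit {tv_sec, tv_nsec} pair from
 * user space and widens it into a y2038-safe struct timespec64.
 */
SYSCALL_DEFINE1(example_settime32, const struct compat_timespec __user *, utp)
{
	struct timespec64 ts;
	int err;

	err = compat_get_timespec64(&ts, utp);
	if (err)
		return err;

	/* ... operate on the 64-bit representation from here on ... */
	return 0;
}
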
Signed-off-by: Deepa Dinamani Signed-off-by: Arnd Bergmann --- include/linux/compat.h | 2 -- include/linux/compat_time.h | 4 ++++ kernel/compat.c | 52 +++++++-------------------------------------- kernel/time/time.c | 44 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 56 insertions(+), 46 deletions(-) (limited to 'include') diff --git a/include/linux/compat.h b/include/linux/compat.h index cd50b00ef59a..cfc1b6383ae0 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -294,8 +294,6 @@ extern int compat_get_timespec(struct timespec *, const void __user *); extern int compat_put_timespec(const struct timespec *, void __user *); extern int compat_get_timeval(struct timeval *, const void __user *); extern int compat_put_timeval(const struct timeval *, void __user *); -extern int compat_get_timespec64(struct timespec64 *, const void __user *); -extern int compat_put_timespec64(const struct timespec64 *, void __user *); extern int get_compat_itimerspec64(struct itimerspec64 *its, const struct compat_itimerspec __user *uits); extern int put_compat_itimerspec64(const struct itimerspec64 *its, diff --git a/include/linux/compat_time.h b/include/linux/compat_time.h index 56a54a1e4355..31f2774f1994 100644 --- a/include/linux/compat_time.h +++ b/include/linux/compat_time.h @@ -3,6 +3,7 @@ #define _LINUX_COMPAT_TIME_H #include +#include typedef s32 compat_time_t; @@ -16,4 +17,7 @@ struct compat_timeval { s32 tv_usec; }; +extern int compat_get_timespec64(struct timespec64 *, const void __user *); +extern int compat_put_timespec64(const struct timespec64 *, void __user *); + #endif /* _LINUX_COMPAT_TIME_H */ diff --git a/kernel/compat.c b/kernel/compat.c index 6d21894806b4..51a081b46832 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -120,50 +120,6 @@ static int __compat_put_timespec(const struct timespec *ts, struct compat_timesp __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; } -static int __compat_get_timespec64(struct timespec64 *ts64, - const struct compat_timespec __user *cts) -{ - struct compat_timespec ts; - int ret; - - ret = copy_from_user(&ts, cts, sizeof(ts)); - if (ret) - return -EFAULT; - - ts64->tv_sec = ts.tv_sec; - ts64->tv_nsec = ts.tv_nsec; - - return 0; -} - -static int __compat_put_timespec64(const struct timespec64 *ts64, - struct compat_timespec __user *cts) -{ - struct compat_timespec ts = { - .tv_sec = ts64->tv_sec, - .tv_nsec = ts64->tv_nsec - }; - return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0; -} - -int compat_get_timespec64(struct timespec64 *ts, const void __user *uts) -{ - if (COMPAT_USE_64BIT_TIME) - return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0; - else - return __compat_get_timespec64(ts, uts); -} -EXPORT_SYMBOL_GPL(compat_get_timespec64); - -int compat_put_timespec64(const struct timespec64 *ts, void __user *uts) -{ - if (COMPAT_USE_64BIT_TIME) - return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0; - else - return __compat_put_timespec64(ts, uts); -} -EXPORT_SYMBOL_GPL(compat_put_timespec64); - int compat_get_timeval(struct timeval *tv, const void __user *utv) { if (COMPAT_USE_64BIT_TIME) @@ -367,6 +323,14 @@ COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len, return ret; } +/* Todo: Delete these extern declarations when get/put_compat_itimerspec64() + * are moved to kernel/time/time.c . 
+ */ +extern int __compat_get_timespec64(struct timespec64 *ts64, + const struct compat_timespec __user *cts); +extern int __compat_put_timespec64(const struct timespec64 *ts64, + struct compat_timespec __user *cts); + int get_compat_itimerspec64(struct itimerspec64 *its, const struct compat_itimerspec __user *uits) { diff --git a/kernel/time/time.c b/kernel/time/time.c index 3044d48ebe56..df61143a54f6 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -880,6 +880,50 @@ int put_timespec64(const struct timespec64 *ts, } EXPORT_SYMBOL_GPL(put_timespec64); +int __compat_get_timespec64(struct timespec64 *ts64, + const struct compat_timespec __user *cts) +{ + struct compat_timespec ts; + int ret; + + ret = copy_from_user(&ts, cts, sizeof(ts)); + if (ret) + return -EFAULT; + + ts64->tv_sec = ts.tv_sec; + ts64->tv_nsec = ts.tv_nsec; + + return 0; +} + +int __compat_put_timespec64(const struct timespec64 *ts64, + struct compat_timespec __user *cts) +{ + struct compat_timespec ts = { + .tv_sec = ts64->tv_sec, + .tv_nsec = ts64->tv_nsec + }; + return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0; +} + +int compat_get_timespec64(struct timespec64 *ts, const void __user *uts) +{ + if (COMPAT_USE_64BIT_TIME) + return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0; + else + return __compat_get_timespec64(ts, uts); +} +EXPORT_SYMBOL_GPL(compat_get_timespec64); + +int compat_put_timespec64(const struct timespec64 *ts, void __user *uts) +{ + if (COMPAT_USE_64BIT_TIME) + return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0; + else + return __compat_put_timespec64(ts, uts); +} +EXPORT_SYMBOL_GPL(compat_put_timespec64); + int get_itimerspec64(struct itimerspec64 *it, const struct itimerspec __user *uit) { -- cgit v1.2.3 From acf8870a62afd4f1b3c0b695aaa619df355c0851 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Tue, 13 Mar 2018 21:03:30 -0700 Subject: time: Add new y2038 safe __kernel_timespec The new struct __kernel_timespec is similar to current internal kernel struct timespec64 on 64 bit architecture. The compat structure however is similar to below on little endian systems (padding and tv_nsec are switched for big endian systems): typedef s32 compat_long_t; typedef s64 compat_kernel_time64_t; struct compat_kernel_timespec { compat_kernel_time64_t tv_sec; compat_long_t tv_nsec; compat_long_t padding; }; This allows for both the native and compat representations to be the same and syscalls using this type as part of their ABI can have a single entry point to both. Note that the compat define is not included anywhere in the kernel explicitly to avoid confusion. These types will be used by the new syscalls that will be introduced in the consequent patches. Most of the new syscalls are just an update to the existing native ones with this new type. Hence, put this new type under an ifdef so that the architectures can define CONFIG_64BIT_TIME when they are ready to handle this switch. 
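[Editor's note] A standalone sketch (not part of the patch) of the layout equivalence the message above relies on. The struct names are stand-ins for __kernel_timespec and its little-endian compat form; on big-endian targets tv_nsec and padding swap places as stated.

#include <stddef.h>
#include <stdint.h>

struct knl_timespec {			/* stand-in for struct __kernel_timespec */
	int64_t   tv_sec;
	long long tv_nsec;
};

struct compat_knl_timespec {		/* stand-in for the compat form quoted above */
	int64_t tv_sec;
	int32_t tv_nsec;
	int32_t padding;
};

/* Same size, and tv_nsec starts at the same offset, so on little endian
 * the compat tv_nsec aliases the low word of the native one: a single
 * syscall entry point can serve both ABIs.
 */
_Static_assert(sizeof(struct knl_timespec) ==
	       sizeof(struct compat_knl_timespec), "sizes must match");
_Static_assert(offsetof(struct knl_timespec, tv_nsec) ==
	       offsetof(struct compat_knl_timespec, tv_nsec), "tv_nsec offset");
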
Cc: linux-arch@vger.kernel.org Signed-off-by: Deepa Dinamani Signed-off-by: Arnd Bergmann --- include/linux/time64.h | 10 +++++++++- include/uapi/asm-generic/posix_types.h | 1 + include/uapi/linux/time.h | 7 +++++++ 3 files changed, 17 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/time64.h b/include/linux/time64.h index 93d39499838e..0d96887ba4e0 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -2,12 +2,20 @@ #ifndef _LINUX_TIME64_H #define _LINUX_TIME64_H -#include #include typedef __s64 time64_t; typedef __u64 timeu64_t; +/* CONFIG_64BIT_TIME enables new 64 bit time_t syscalls in the compat path + * and 32-bit emulation. + */ +#ifndef CONFIG_64BIT_TIME +#define __kernel_timespec timespec +#endif + +#include + #if __BITS_PER_LONG == 64 /* this trick allows us to optimize out timespec64_to_timespec */ # define timespec64 timespec diff --git a/include/uapi/asm-generic/posix_types.h b/include/uapi/asm-generic/posix_types.h index 5e6ea22bd525..f0733a26ebfc 100644 --- a/include/uapi/asm-generic/posix_types.h +++ b/include/uapi/asm-generic/posix_types.h @@ -87,6 +87,7 @@ typedef struct { typedef __kernel_long_t __kernel_off_t; typedef long long __kernel_loff_t; typedef __kernel_long_t __kernel_time_t; +typedef long long __kernel_time64_t; typedef __kernel_long_t __kernel_clock_t; typedef int __kernel_timer_t; typedef int __kernel_clockid_t; diff --git a/include/uapi/linux/time.h b/include/uapi/linux/time.h index 16a296612ba4..94adfae599e0 100644 --- a/include/uapi/linux/time.h +++ b/include/uapi/linux/time.h @@ -42,6 +42,13 @@ struct itimerval { struct timeval it_value; /* current value */ }; +#ifndef __kernel_timespec +struct __kernel_timespec { + __kernel_time64_t tv_sec; /* seconds */ + long long tv_nsec; /* nanoseconds */ +}; +#endif + /* * legacy timeval structure, only embedded in structures that * traditionally used 'timeval' to pass time intervals (not absolute -- cgit v1.2.3 From ea2ce8f3514e2074a1910d8d721842d7341d5c81 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Tue, 13 Mar 2018 21:03:31 -0700 Subject: time: Fix get_timespec64() for y2038 safe compat interfaces get/put_timespec64() interfaces will eventually be used for conversions between the new y2038 safe struct __kernel_timespec and struct timespec64. The new y2038 safe syscalls have a common entry for native and compat interfaces. On compat interfaces, the high order bits of nanoseconds should be zeroed out. This is because the application code or the libc does not guarantee zeroing of these. If used without zeroing, the kernel might be at risk of using timespec values incorrectly. Note that clearing of bits is dependent on CONFIG_64BIT_TIME for now. This is until COMPAT_USE_64BIT_TIME has been handled correctly. x86 will be the first architecture that will use the CONFIG_64BIT_TIME.
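[Editor's note] A small userspace-style model (hypothetical, not from the patch) of why get_timespec64() masks the nanoseconds above: a 32-bit task stores only the low word of the 64-bit tv_nsec slot, so the high word may hold garbage until the kernel clears it.

#include <stdint.h>
#include <stdio.h>

/* Models the kts.tv_nsec &= 0xFFFFFFFFUL step for a compat caller */
static int64_t sanitize_nsec(int64_t raw_nsec, int compat_caller)
{
	if (compat_caller)
		raw_nsec &= 0xFFFFFFFFULL;	/* drop uninitialized high bits */
	return raw_nsec;
}

int main(void)
{
	/* high word is stack garbage, low word holds the real value (5 ns) */
	int64_t raw = (int64_t)(0xDEADBEEF00000000ULL | 5);

	printf("%lld\n", (long long)sanitize_nsec(raw, 1));	/* prints 5 */
	return 0;
}
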
Signed-off-by: Deepa Dinamani Signed-off-by: Arnd Bergmann --- include/linux/time.h | 4 ++-- kernel/time/time.c | 14 ++++++++++---- 2 files changed, 12 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/time.h b/include/linux/time.h index 4b62a2c0a661..aed74463592d 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -10,9 +10,9 @@ extern struct timezone sys_tz; int get_timespec64(struct timespec64 *ts, - const struct timespec __user *uts); + const struct __kernel_timespec __user *uts); int put_timespec64(const struct timespec64 *ts, - struct timespec __user *uts); + struct __kernel_timespec __user *uts); int get_itimerspec64(struct itimerspec64 *it, const struct itimerspec __user *uit); int put_itimerspec64(const struct itimerspec64 *it, diff --git a/kernel/time/time.c b/kernel/time/time.c index df61143a54f6..ccd751e95fcb 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -853,9 +853,9 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs, } int get_timespec64(struct timespec64 *ts, - const struct timespec __user *uts) + const struct __kernel_timespec __user *uts) { - struct timespec kts; + struct __kernel_timespec kts; int ret; ret = copy_from_user(&kts, uts, sizeof(kts)); @@ -863,6 +863,11 @@ int get_timespec64(struct timespec64 *ts, return -EFAULT; ts->tv_sec = kts.tv_sec; + + /* Zero out the padding for 32 bit systems or in compat mode */ + if (IS_ENABLED(CONFIG_64BIT_TIME) && (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall())) + kts.tv_nsec &= 0xFFFFFFFFUL; + ts->tv_nsec = kts.tv_nsec; return 0; @@ -870,12 +875,13 @@ int get_timespec64(struct timespec64 *ts, EXPORT_SYMBOL_GPL(get_timespec64); int put_timespec64(const struct timespec64 *ts, - struct timespec __user *uts) + struct __kernel_timespec __user *uts) { - struct timespec kts = { + struct __kernel_timespec kts = { .tv_sec = ts->tv_sec, .tv_nsec = ts->tv_nsec }; + return copy_to_user(uts, &kts, sizeof(kts)) ? -EFAULT : 0; } EXPORT_SYMBOL_GPL(put_timespec64); -- cgit v1.2.3 From 6d5b84132459c644cf4ee8de090382bad44b8ebd Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Tue, 13 Mar 2018 21:03:32 -0700 Subject: time: Change types to new y2038 safe __kernel_* types Change over clock_settime, clock_gettime and clock_getres syscalls to use __kernel_timespec times. This will enable changing over of these syscalls to use new y2038 safe syscalls when the architectures define the CONFIG_64BIT_TIME. 
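[Editor's note] From the userspace side, the type change above is ABI-transparent on 64-bit targets, where struct timespec already has the same layout as struct __kernel_timespec; a minimal caller for illustration (assuming a glibc-style environment):

#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;	/* matches __kernel_timespec layout on 64-bit */

	if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0)
		return 1;
	printf("%lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
	return 0;
}
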
Cc: linux-api@vger.kernel.org Signed-off-by: Deepa Dinamani Signed-off-by: Arnd Bergmann --- include/linux/syscalls.h | 6 +++--- kernel/time/posix-stubs.c | 6 +++--- kernel/time/posix-timers.c | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 70fcda1a9049..40bb40d1741b 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -567,11 +567,11 @@ asmlinkage long sys_timer_settime(timer_t timer_id, int flags, struct itimerspec __user *old_setting); asmlinkage long sys_timer_delete(timer_t timer_id); asmlinkage long sys_clock_settime(clockid_t which_clock, - const struct timespec __user *tp); + const struct __kernel_timespec __user *tp); asmlinkage long sys_clock_gettime(clockid_t which_clock, - struct timespec __user *tp); + struct __kernel_timespec __user *tp); asmlinkage long sys_clock_getres(clockid_t which_clock, - struct timespec __user *tp); + struct __kernel_timespec __user *tp); asmlinkage long sys_clock_nanosleep(clockid_t which_clock, int flags, const struct timespec __user *rqtp, struct timespec __user *rmtp); diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index b605b0a40ba9..4e76021cea7f 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -59,7 +59,7 @@ SYS_NI(alarm); */ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, - const struct timespec __user *, tp) + const struct __kernel_timespec __user *, tp) { struct timespec64 new_tp; @@ -92,7 +92,7 @@ int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp) return 0; } SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, - struct timespec __user *, tp) + struct __kernel_timespec __user *, tp) { int ret; struct timespec64 kernel_tp; @@ -106,7 +106,7 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, return 0; } -SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp) +SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct __kernel_timespec __user *, tp) { struct timespec64 rtn_tp = { .tv_sec = 0, diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 147d79e2cef5..93b2c38ad0f3 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1041,7 +1041,7 @@ void exit_itimers(struct signal_struct *sig) } SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, - const struct timespec __user *, tp) + const struct __kernel_timespec __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 new_tp; @@ -1056,7 +1056,7 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, } SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, - struct timespec __user *,tp) + struct __kernel_timespec __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 kernel_tp; @@ -1097,7 +1097,7 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, } SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, - struct timespec __user *, tp) + struct __kernel_timespec __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 rtn_tp; -- cgit v1.2.3 From 01909974b41036a6a8d3907c66cc7b41c9a73da9 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Tue, 13 Mar 2018 21:03:33 -0700 Subject: time: Change nanosleep to safe __kernel_* types Change over clock_nanosleep syscalls to use y2038 safe __kernel_timespec times. 
This will enable changing over of these syscalls to use new y2038 safe syscalls when the architectures define the CONFIG_64BIT_TIME. Note that the nanosleep syscall is deprecated and there is no plan for making it y2038 safe. The syscall still works as before on 64 bit machines; on 32 bit machines it keeps working correctly until y2038 through the existing compat syscall version. There is no new syscall for supporting 64 bit time_t on 32 bit architectures. Cc: linux-api@vger.kernel.org Signed-off-by: Deepa Dinamani Signed-off-by: Arnd Bergmann --- include/linux/restart_block.h | 7 ++----- include/linux/syscalls.h | 7 ++++--- kernel/time/hrtimer.c | 8 ++++++-- kernel/time/posix-stubs.c | 4 ++-- kernel/time/posix-timers.c | 4 ++-- 5 files changed, 16 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/restart_block.h b/include/linux/restart_block.h index bcfdb918cd81..5d83d0c1d06c 100644 --- a/include/linux/restart_block.h +++ b/include/linux/restart_block.h @@ -7,6 +7,7 @@ #include #include +#include struct timespec; struct compat_timespec; @@ -15,9 +16,7 @@ struct pollfd; enum timespec_type { TT_NONE = 0, TT_NATIVE = 1, -#ifdef CONFIG_COMPAT TT_COMPAT = 2, -#endif }; /* @@ -40,10 +39,8 @@ struct restart_block { clockid_t clockid; enum timespec_type type; union { - struct timespec __user *rmtp; -#ifdef CONFIG_COMPAT + struct __kernel_timespec __user *rmtp; struct compat_timespec __user *compat_rmtp; -#endif }; u64 expires; } nanosleep; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 40bb40d1741b..c9a2a2601852 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -536,7 +536,8 @@ asmlinkage long sys_set_robust_list(struct robust_list_head __user *head, size_t len); /* kernel/hrtimer.c */ -asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp); +asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp, + struct __kernel_timespec __user *rmtp); /* kernel/itimer.c */ asmlinkage long sys_getitimer(int which, struct itimerval __user *value); @@ -573,8 +574,8 @@ asmlinkage long sys_clock_gettime(clockid_t which_clock, asmlinkage long sys_clock_getres(clockid_t which_clock, struct __kernel_timespec __user *tp); asmlinkage long sys_clock_nanosleep(clockid_t which_clock, int flags, - const struct timespec __user *rqtp, - struct timespec __user *rmtp); + const struct __kernel_timespec __user *rqtp, + struct __kernel_timespec __user *rmtp); /* kernel/printk.c */ asmlinkage long sys_syslog(int type, char __user *buf, int len); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index f183257ff0c6..d7051b3993b5 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1747,8 +1747,10 @@ out: return ret; } -SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, - struct timespec __user *, rmtp) +#if !defined(CONFIG_64BIT_TIME) || defined(CONFIG_64BIT) + +SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, + struct __kernel_timespec __user *, rmtp) { struct timespec64 tu; @@ -1763,6 +1765,8 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC); } +#endif + #ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 4e76021cea7f..474e4ca2e28f 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -126,8 +126,8 @@
SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct __kernel_time } SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, - const struct timespec __user *, rqtp, - struct timespec __user *, rmtp) + const struct __kernel_timespec __user *, rqtp, + struct __kernel_timespec __user *, rmtp) { struct timespec64 t; diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 93b2c38ad0f3..c21f4c4f8660 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1212,8 +1212,8 @@ static int common_nsleep(const clockid_t which_clock, int flags, } SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, - const struct timespec __user *, rqtp, - struct timespec __user *, rmtp) + const struct __kernel_timespec __user *, rqtp, + struct __kernel_timespec __user *, rmtp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 t; -- cgit v1.2.3 From e21db6f69a95b846ff04e31fe0a86004cbd000d7 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Tue, 17 Apr 2018 23:18:48 -0700 Subject: tcp: track total bytes delivered with ECN CE marks Introduce a new delivered_ce stat in the tcp socket to estimate the number of packets being marked with CE bits. The estimation is done via ACKs with the ECE bit. Depending on the actual receiver behavior, the estimation could have biases. Since the TCP sender can't really see the CE bit in the data path, it is technically counting packets marked delivered with the "ECE / ECN-Echo" flag set. With RFC3168 ECN, because the ECE bit is sticky, this count can drastically overestimate the number of CE-marked data packets. With DCTCP-style ECN this should be reasonably precise unless there is loss in the ACK path, in which case it's not precise. With the AccECN proposal this can be made still more precise, even in the case of some degree of ACK loss. However, this is the sender's best estimate of CE information. Signed-off-by: Yuchung Cheng Reviewed-by: Neal Cardwell Reviewed-by: Soheil Hassas Yeganeh Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + net/ipv4/tcp.c | 1 + net/ipv4/tcp_input.c | 2 ++ 3 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 8f4c54986f97..20585d5c4e1c 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -281,6 +281,7 @@ struct tcp_sock { * receiver in Recovery. */ u32 prr_out; /* Total number of pkts sent during Recovery. */ u32 delivered; /* Total data packets delivered incl. rexmits */ + u32 delivered_ce; /* Like the above but only ECE marked packets */ u32 lost; /* Total data packets lost incl.
rexmits */ u32 app_limited; /* limited until "delivered" reaches this val */ u64 first_tx_mstamp; /* start of window send phase */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 438fbca96cd3..5a5ce6da4792 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2559,6 +2559,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd_cnt = 0; tp->window_clamp = 0; + tp->delivered_ce = 0; tcp_set_ca_state(sk, TCP_CA_Open); tp->is_sack_reneg = 0; tcp_clear_retrans(tp); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 01cce28f90ca..b3bff9c20606 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3503,6 +3503,8 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) u32 delivered; delivered = tp->delivered - prior_delivered; + if (flag & FLAG_ECE) + tp->delivered_ce += delivered; return delivered; } -- cgit v1.2.3 From feb5f2ec646483fb66f9ad7218b1aad2a93a2a5c Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Tue, 17 Apr 2018 23:18:49 -0700 Subject: tcp: export packets delivery info Export data delivered and delivered with CE marks to 1) SNMP TCPDelivered and TCPDeliveredCE 2) getsockopt(TCP_INFO) 3) Timestamping API SOF_TIMESTAMPING_OPT_STATS Note that for SCM_TSTAMP_ACK, the delivery info in SOF_TIMESTAMPING_OPT_STATS is reported before the info was fully updated on the ACK. These stats help applications monitor TCP delivery and ECN status at the per-host, per-connection, and even per-message level. Signed-off-by: Yuchung Cheng Reviewed-by: Neal Cardwell Reviewed-by: Soheil Hassas Yeganeh Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 2 ++ include/uapi/linux/tcp.h | 5 +++++ net/ipv4/proc.c | 2 ++ net/ipv4/tcp.c | 7 ++++++- net/ipv4/tcp_input.c | 6 +++++- 5 files changed, 20 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 33a70ece462f..d02e859301ff 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -276,6 +276,8 @@ enum LINUX_MIB_TCPKEEPALIVE, /* TCPKeepAlive */ LINUX_MIB_TCPMTUPFAIL, /* TCPMTUPFail */ LINUX_MIB_TCPMTUPSUCCESS, /* TCPMTUPSuccess */ + LINUX_MIB_TCPDELIVERED, /* TCPDelivered */ + LINUX_MIB_TCPDELIVEREDCE, /* TCPDeliveredCE */ __LINUX_MIB_MAX }; diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 560374c978f9..379b08700a54 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -224,6 +224,9 @@ struct tcp_info { __u64 tcpi_busy_time; /* Time (usec) busy sending data */ __u64 tcpi_rwnd_limited; /* Time (usec) limited by receive window */ __u64 tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */ + + __u32 tcpi_delivered; + __u32 tcpi_delivered_ce; }; /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ @@ -244,6 +247,8 @@ enum { TCP_NLA_SNDQ_SIZE, /* Data (bytes) pending in send queue */ TCP_NLA_CA_STATE, /* ca_state of socket */ TCP_NLA_SND_SSTHRESH, /* Slow start size threshold */ + TCP_NLA_DELIVERED, /* Data pkts delivered incl.
out-of-order */ + TCP_NLA_DELIVERED_CE, /* Like above but only ones w/ CE marks */ }; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index a058de677e94..261b71d0ccc5 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -296,6 +296,8 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE), SNMP_MIB_ITEM("TCPMTUPFail", LINUX_MIB_TCPMTUPFAIL), SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS), + SNMP_MIB_ITEM("TCPDelivered", LINUX_MIB_TCPDELIVERED), + SNMP_MIB_ITEM("TCPDeliveredCE", LINUX_MIB_TCPDELIVEREDCE), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5a5ce6da4792..4022073b0aee 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3167,6 +3167,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) rate64 = tcp_compute_delivery_rate(tp); if (rate64) info->tcpi_delivery_rate = rate64; + info->tcpi_delivered = tp->delivered; + info->tcpi_delivered_ce = tp->delivered_ce; unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -3180,7 +3182,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) u32 rate; stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) + - 5 * nla_total_size(sizeof(u32)) + + 7 * nla_total_size(sizeof(u32)) + 3 * nla_total_size(sizeof(u8)), GFP_ATOMIC); if (!stats) return NULL; @@ -3211,9 +3213,12 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits); nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited); nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh); + nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered); + nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce); nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una); nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state); + return stats; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b3bff9c20606..0396fb919b5d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3499,12 +3499,16 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit) /* Returns the number of packets newly acked or sacked by the current ACK */ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) { + const struct net *net = sock_net(sk); struct tcp_sock *tp = tcp_sk(sk); u32 delivered; delivered = tp->delivered - prior_delivered; - if (flag & FLAG_ECE) + NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered); + if (flag & FLAG_ECE) { tp->delivered_ce += delivered; + NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered); + } return delivered; } -- cgit v1.2.3 From af81f9e75e546de43f4f0bf532d880ddc16a4dc8 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 26 Feb 2018 10:15:08 +0100 Subject: netfilter: nf_flow_table: use IP_CT_DIR_* values for FLOW_OFFLOAD_DIR_* Simplifies further code cleanups Signed-off-by: Felix Fietkau Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 833752dd0c58..09ba67598991 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -6,6 +6,7 @@ #include #include #include +#include #include struct nf_flowtable; @@ -27,11 +28,10 @@ struct nf_flowtable { }; enum flow_offload_tuple_dir { - FLOW_OFFLOAD_DIR_ORIGINAL, - FLOW_OFFLOAD_DIR_REPLY, - 
__FLOW_OFFLOAD_DIR_MAX = FLOW_OFFLOAD_DIR_REPLY, + FLOW_OFFLOAD_DIR_ORIGINAL = IP_CT_DIR_ORIGINAL, + FLOW_OFFLOAD_DIR_REPLY = IP_CT_DIR_REPLY, + FLOW_OFFLOAD_DIR_MAX = IP_CT_DIR_MAX }; -#define FLOW_OFFLOAD_DIR_MAX (__FLOW_OFFLOAD_DIR_MAX + 1) struct flow_offload_tuple { union { -- cgit v1.2.3 From 88078d98d1bb085d72af8437707279e203524fa5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Apr 2018 11:43:15 -0700 Subject: net: pskb_trim_rcsum() and CHECKSUM_COMPLETE are friends After working on IP defragmentation lately, I found that some large packets defeat the CHECKSUM_COMPLETE optimization because the NIC adds zero padding on the last (small) fragment. While removing the padding with pskb_trim_rcsum(), we set skb->ip_summed to CHECKSUM_NONE, forcing a full csum validation, even if all prior fragments had CHECKSUM_COMPLETE set. We can instead compute the checksum of the part we are trimming, usually smaller than the part we keep. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 5 ++--- net/core/skbuff.c | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9065477ed255..d274059529eb 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3131,6 +3131,7 @@ static inline void *skb_push_rcsum(struct sk_buff *skb, unsigned int len) return skb->data; } +int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len); /** * pskb_trim_rcsum - trim received skb and update checksum * @skb: buffer to trim @@ -3144,9 +3145,7 @@ static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len) { if (likely(len >= skb->len)) return 0; - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; - return __pskb_trim(skb, len); + return pskb_trim_rcsum_slow(skb, len); } static inline int __skb_trim_rcsum(struct sk_buff *skb, unsigned int len) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 345b51837ca8..ff49e352deea 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1839,6 +1839,20 @@ done: } EXPORT_SYMBOL(___pskb_trim); +/* Note : use pskb_trim_rcsum() instead of calling this directly + */ +int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len) +{ + if (skb->ip_summed == CHECKSUM_COMPLETE) { + int delta = skb->len - len; + + skb->csum = csum_sub(skb->csum, + skb_checksum(skb, len, delta, 0)); + } + return __pskb_trim(skb, len); +} +EXPORT_SYMBOL(pskb_trim_rcsum_slow); + /** * __pskb_pull_tail - advance tail of skb header * @skb: buffer to reallocate -- cgit v1.2.3 From 93c2fb253d177a0b8f4f93592441f88c9b7d6245 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 18 Apr 2018 15:38:59 -0700 Subject: net/ipv6: Rename fib6_info struct elements Change the prefix for fib6_info struct elements from rt6i_ to fib6_. rt6i_pcpu and rt6i_exception_bucket are left as is given that they point to rt6_info entries. Rename only; no functional change intended. Signed-off-by: David Ahern Signed-off-by: David S.
Miller --- .../net/ethernet/mellanox/mlxsw/spectrum_router.c | 56 ++--- include/net/ip6_fib.h | 38 +-- include/net/ip6_route.h | 26 +- net/ipv6/addrconf.c | 24 +- net/ipv6/anycast.c | 8 +- net/ipv6/ip6_fib.c | 170 ++++++------- net/ipv6/ndisc.c | 2 +- net/ipv6/route.c | 272 ++++++++++----------- 8 files changed, 298 insertions(+), 298 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index b85b15851841..8e4edb634b11 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -4705,17 +4705,17 @@ static bool mlxsw_sp_fib6_rt_should_ignore(const struct fib6_info *rt) * are trapped to the CPU, so no need to program specific routes * for them. */ - if (ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LINKLOCAL) + if (ipv6_addr_type(&rt->fib6_dst.addr) & IPV6_ADDR_LINKLOCAL) return true; /* Multicast routes aren't supported, so ignore them. Neighbour * Discovery packets are specifically trapped. */ - if (ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_MULTICAST) + if (ipv6_addr_type(&rt->fib6_dst.addr) & IPV6_ADDR_MULTICAST) return true; /* Cloned routes are irrelevant in the forwarding path. */ - if (rt->rt6i_flags & RTF_CACHE) + if (rt->fib6_flags & RTF_CACHE) return true; return false; @@ -4759,7 +4759,7 @@ static void mlxsw_sp_rt6_destroy(struct mlxsw_sp_rt6 *mlxsw_sp_rt6) static bool mlxsw_sp_fib6_rt_can_mp(const struct fib6_info *rt) { /* RTF_CACHE routes are ignored */ - return (rt->rt6i_flags & (RTF_GATEWAY | RTF_ADDRCONF)) == RTF_GATEWAY; + return (rt->fib6_flags & (RTF_GATEWAY | RTF_ADDRCONF)) == RTF_GATEWAY; } static struct fib6_info * @@ -4784,16 +4784,16 @@ mlxsw_sp_fib6_node_mp_entry_find(const struct mlxsw_sp_fib_node *fib_node, /* RT6_TABLE_LOCAL and RT6_TABLE_MAIN share the same * virtual router. */ - if (rt->rt6i_table->tb6_id > nrt->rt6i_table->tb6_id) + if (rt->fib6_table->tb6_id > nrt->fib6_table->tb6_id) continue; - if (rt->rt6i_table->tb6_id != nrt->rt6i_table->tb6_id) + if (rt->fib6_table->tb6_id != nrt->fib6_table->tb6_id) break; - if (rt->rt6i_metric < nrt->rt6i_metric) + if (rt->fib6_metric < nrt->fib6_metric) continue; - if (rt->rt6i_metric == nrt->rt6i_metric && + if (rt->fib6_metric == nrt->fib6_metric && mlxsw_sp_fib6_rt_can_mp(rt)) return fib6_entry; - if (rt->rt6i_metric > nrt->rt6i_metric) + if (rt->fib6_metric > nrt->fib6_metric) break; } @@ -4899,7 +4899,7 @@ static void mlxsw_sp_nexthop6_fini(struct mlxsw_sp *mlxsw_sp, static bool mlxsw_sp_rt6_is_gateway(const struct mlxsw_sp *mlxsw_sp, const struct fib6_info *rt) { - return rt->rt6i_flags & RTF_GATEWAY || + return rt->fib6_flags & RTF_GATEWAY || mlxsw_sp_nexthop6_ipip_type(mlxsw_sp, rt, NULL); } @@ -5092,9 +5092,9 @@ static void mlxsw_sp_fib6_entry_type_set(struct mlxsw_sp *mlxsw_sp, * local, which will cause them to be trapped with a lower * priority than packets that need to be locally received. 
*/ - if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) + if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_TRAP; - else if (rt->rt6i_flags & RTF_REJECT) + else if (rt->fib6_flags & RTF_REJECT) fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_LOCAL; else if (mlxsw_sp_rt6_is_gateway(mlxsw_sp, rt)) fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_REMOTE; @@ -5175,18 +5175,18 @@ mlxsw_sp_fib6_node_entry_find(const struct mlxsw_sp_fib_node *fib_node, list_for_each_entry(fib6_entry, &fib_node->entry_list, common.list) { struct fib6_info *rt = mlxsw_sp_fib6_entry_rt(fib6_entry); - if (rt->rt6i_table->tb6_id > nrt->rt6i_table->tb6_id) + if (rt->fib6_table->tb6_id > nrt->fib6_table->tb6_id) continue; - if (rt->rt6i_table->tb6_id != nrt->rt6i_table->tb6_id) + if (rt->fib6_table->tb6_id != nrt->fib6_table->tb6_id) break; - if (replace && rt->rt6i_metric == nrt->rt6i_metric) { + if (replace && rt->fib6_metric == nrt->fib6_metric) { if (mlxsw_sp_fib6_rt_can_mp(rt) == mlxsw_sp_fib6_rt_can_mp(nrt)) return fib6_entry; if (mlxsw_sp_fib6_rt_can_mp(nrt)) fallback = fallback ?: fib6_entry; } - if (rt->rt6i_metric > nrt->rt6i_metric) + if (rt->fib6_metric > nrt->fib6_metric) return fallback ?: fib6_entry; } @@ -5215,7 +5215,7 @@ mlxsw_sp_fib6_node_list_insert(struct mlxsw_sp_fib6_entry *new6_entry, list_for_each_entry(last, &fib_node->entry_list, common.list) { struct fib6_info *rt = mlxsw_sp_fib6_entry_rt(last); - if (nrt->rt6i_table->tb6_id > rt->rt6i_table->tb6_id) + if (nrt->fib6_table->tb6_id > rt->fib6_table->tb6_id) break; fib6_entry = last; } @@ -5275,22 +5275,22 @@ mlxsw_sp_fib6_entry_lookup(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_fib *fib; struct mlxsw_sp_vr *vr; - vr = mlxsw_sp_vr_find(mlxsw_sp, rt->rt6i_table->tb6_id); + vr = mlxsw_sp_vr_find(mlxsw_sp, rt->fib6_table->tb6_id); if (!vr) return NULL; fib = mlxsw_sp_vr_fib(vr, MLXSW_SP_L3_PROTO_IPV6); - fib_node = mlxsw_sp_fib_node_lookup(fib, &rt->rt6i_dst.addr, - sizeof(rt->rt6i_dst.addr), - rt->rt6i_dst.plen); + fib_node = mlxsw_sp_fib_node_lookup(fib, &rt->fib6_dst.addr, + sizeof(rt->fib6_dst.addr), + rt->fib6_dst.plen); if (!fib_node) return NULL; list_for_each_entry(fib6_entry, &fib_node->entry_list, common.list) { struct fib6_info *iter_rt = mlxsw_sp_fib6_entry_rt(fib6_entry); - if (rt->rt6i_table->tb6_id == iter_rt->rt6i_table->tb6_id && - rt->rt6i_metric == iter_rt->rt6i_metric && + if (rt->fib6_table->tb6_id == iter_rt->fib6_table->tb6_id && + rt->fib6_metric == iter_rt->fib6_metric && mlxsw_sp_fib6_entry_rt_find(fib6_entry, rt)) return fib6_entry; } @@ -5325,16 +5325,16 @@ static int mlxsw_sp_router_fib6_add(struct mlxsw_sp *mlxsw_sp, if (mlxsw_sp->router->aborted) return 0; - if (rt->rt6i_src.plen) + if (rt->fib6_src.plen) return -EINVAL; if (mlxsw_sp_fib6_rt_should_ignore(rt)) return 0; - fib_node = mlxsw_sp_fib_node_get(mlxsw_sp, rt->rt6i_table->tb6_id, - &rt->rt6i_dst.addr, - sizeof(rt->rt6i_dst.addr), - rt->rt6i_dst.plen, + fib_node = mlxsw_sp_fib_node_get(mlxsw_sp, rt->fib6_table->tb6_id, + &rt->fib6_dst.addr, + sizeof(rt->fib6_dst.addr), + rt->fib6_dst.plen, MLXSW_SP_L3_PROTO_IPV6); if (IS_ERR(fib_node)) return PTR_ERR(fib_node); diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index a36116b92100..994dd67207fb 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -134,34 +134,34 @@ struct fib6_nh { }; struct fib6_info { - struct fib6_table *rt6i_table; + struct fib6_table *fib6_table; struct fib6_info __rcu *rt6_next; - struct fib6_node __rcu *rt6i_node; + struct fib6_node __rcu 
*fib6_node; /* Multipath routes: * siblings is a list of fib6_info that have the the same metric/weight, * destination, but not the same gateway. nsiblings is just a cache * to speed up lookup. */ - struct list_head rt6i_siblings; - unsigned int rt6i_nsiblings; + struct list_head fib6_siblings; + unsigned int fib6_nsiblings; - atomic_t rt6i_ref; - struct inet6_dev *rt6i_idev; + atomic_t fib6_ref; + struct inet6_dev *fib6_idev; unsigned long expires; struct dst_metrics *fib6_metrics; #define fib6_pmtu fib6_metrics->metrics[RTAX_MTU-1] - struct rt6key rt6i_dst; - u32 rt6i_flags; - struct rt6key rt6i_src; - struct rt6key rt6i_prefsrc; + struct rt6key fib6_dst; + u32 fib6_flags; + struct rt6key fib6_src; + struct rt6key fib6_prefsrc; struct rt6_info * __percpu *rt6i_pcpu; struct rt6_exception_bucket __rcu *rt6i_exception_bucket; - u32 rt6i_metric; - u8 rt6i_protocol; + u32 fib6_metric; + u8 fib6_protocol; u8 fib6_type; u8 exception_bucket_flushed:1, should_flush:1, @@ -206,7 +206,7 @@ static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) static inline void fib6_clean_expires(struct fib6_info *f6i) { - f6i->rt6i_flags &= ~RTF_EXPIRES; + f6i->fib6_flags &= ~RTF_EXPIRES; f6i->expires = 0; } @@ -214,12 +214,12 @@ static inline void fib6_set_expires(struct fib6_info *f6i, unsigned long expires) { f6i->expires = expires; - f6i->rt6i_flags |= RTF_EXPIRES; + f6i->fib6_flags |= RTF_EXPIRES; } static inline bool fib6_check_expired(const struct fib6_info *f6i) { - if (f6i->rt6i_flags & RTF_EXPIRES) + if (f6i->fib6_flags & RTF_EXPIRES) return time_after(jiffies, f6i->expires); return false; } @@ -250,14 +250,14 @@ static inline void rt6_update_expires(struct rt6_info *rt0, int timeout) * Return true if we can get cookie safely * Return false if not */ -static inline bool rt6_get_cookie_safe(const struct fib6_info *rt, +static inline bool rt6_get_cookie_safe(const struct fib6_info *f6i, u32 *cookie) { struct fib6_node *fn; bool status = false; rcu_read_lock(); - fn = rcu_dereference(rt->rt6i_node); + fn = rcu_dereference(f6i->fib6_node); if (fn) { *cookie = fn->fn_sernum; @@ -295,12 +295,12 @@ void fib6_info_destroy(struct fib6_info *f6i); static inline void fib6_info_hold(struct fib6_info *f6i) { - atomic_inc(&f6i->rt6i_ref); + atomic_inc(&f6i->fib6_ref); } static inline void fib6_info_release(struct fib6_info *f6i) { - if (f6i && atomic_dec_and_test(&f6i->rt6i_ref)) + if (f6i && atomic_dec_and_test(&f6i->fib6_ref)) fib6_info_destroy(f6i); } diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index d5fb1e4ae7ac..b4b85109984f 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -66,9 +66,9 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr) (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); } -static inline bool rt6_qualify_for_ecmp(const struct fib6_info *rt) +static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i) { - return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == + return (f6i->fib6_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == RTF_GATEWAY; } @@ -102,23 +102,23 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg); int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack); -int ip6_ins_rt(struct net *net, struct fib6_info *rt); -int ip6_del_rt(struct net *net, struct fib6_info *rt); +int ip6_ins_rt(struct net *net, struct fib6_info *f6i); +int ip6_del_rt(struct net *net, struct fib6_info *f6i); -void rt6_flush_exceptions(struct fib6_info 
*rt); -void rt6_age_exceptions(struct fib6_info *rt, struct fib6_gc_args *gc_args, +void rt6_flush_exceptions(struct fib6_info *f6i); +void rt6_age_exceptions(struct fib6_info *f6i, struct fib6_gc_args *gc_args, unsigned long now); -static inline int ip6_route_get_saddr(struct net *net, struct fib6_info *rt, +static inline int ip6_route_get_saddr(struct net *net, struct fib6_info *f6i, const struct in6_addr *daddr, unsigned int prefs, struct in6_addr *saddr) { - struct inet6_dev *idev = rt ? rt->rt6i_idev : NULL; + struct inet6_dev *idev = f6i ? f6i->fib6_idev : NULL; int err = 0; - if (rt && rt->rt6i_prefsrc.plen) - *saddr = rt->rt6i_prefsrc.addr; + if (f6i && f6i->fib6_prefsrc.plen) + *saddr = f6i->fib6_prefsrc.addr; else err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL, daddr, prefs, saddr); @@ -176,14 +176,14 @@ struct rt6_rtnl_dump_arg { struct net *net; }; -int rt6_dump_route(struct fib6_info *rt, void *p_arg); +int rt6_dump_route(struct fib6_info *f6i, void *p_arg); void rt6_mtu_change(struct net_device *dev, unsigned int mtu); void rt6_remove_prefsrc(struct inet6_ifaddr *ifp); void rt6_clean_tohost(struct net *net, struct in6_addr *gateway); void rt6_sync_up(struct net_device *dev, unsigned int nh_flags); void rt6_disable_ip(struct net_device *dev, unsigned long event); void rt6_sync_down_dev(struct net_device *dev, unsigned long event); -void rt6_multipath_rebalance(struct fib6_info *rt); +void rt6_multipath_rebalance(struct fib6_info *f6i); void rt6_uncached_list_add(struct rt6_info *rt); void rt6_uncached_list_del(struct rt6_info *rt); @@ -274,7 +274,7 @@ static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt, static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info *b) { return a->fib6_nh.nh_dev == b->fib6_nh.nh_dev && - a->rt6i_idev == b->rt6i_idev && + a->fib6_idev == b->fib6_idev && ipv6_addr_equal(&a->fib6_nh.nh_gw, &b->fib6_nh.nh_gw) && !lwtunnel_cmp_encap(a->fib6_nh.nh_lwtstate, b->fib6_nh.nh_lwtstate); } diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index a294e86a9b25..f38ea829c28b 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1178,19 +1178,19 @@ check_cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long *expires) static void cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_rt) { - struct fib6_info *rt; + struct fib6_info *f6i; - rt = addrconf_get_prefix_route(&ifp->addr, + f6i = addrconf_get_prefix_route(&ifp->addr, ifp->prefix_len, ifp->idev->dev, 0, RTF_GATEWAY | RTF_DEFAULT); - if (rt) { + if (f6i) { if (del_rt) - ip6_del_rt(dev_net(ifp->idev->dev), rt); + ip6_del_rt(dev_net(ifp->idev->dev), f6i); else { - if (!(rt->rt6i_flags & RTF_EXPIRES)) - fib6_set_expires(rt, expires); - fib6_info_release(rt); + if (!(f6i->fib6_flags & RTF_EXPIRES)) + fib6_set_expires(f6i, expires); + fib6_info_release(f6i); } } } @@ -2370,9 +2370,9 @@ static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, for_each_fib6_node_rt_rcu(fn) { if (rt->fib6_nh.nh_dev->ifindex != dev->ifindex) continue; - if ((rt->rt6i_flags & flags) != flags) + if ((rt->fib6_flags & flags) != flags) continue; - if ((rt->rt6i_flags & noflags) != 0) + if ((rt->fib6_flags & noflags) != 0) continue; fib6_info_hold(rt); break; @@ -3341,11 +3341,11 @@ static int fixup_permanent_addr(struct net *net, struct inet6_dev *idev, struct inet6_ifaddr *ifp) { - /* !rt6i_node means the host route was removed from the + /* !fib6_node means the host route was removed from the * FIB, for example, if 'lo' device is 
taken down. In that * case regenerate the host route. */ - if (!ifp->rt || !ifp->rt->rt6i_node) { + if (!ifp->rt || !ifp->rt->fib6_node) { struct fib6_info *rt, *prev; rt = addrconf_dst_alloc(net, idev, &ifp->addr, false, @@ -5612,7 +5612,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) * our DAD process, so we don't need * to do it again */ - if (!rcu_access_pointer(ifp->rt->rt6i_node)) + if (!rcu_access_pointer(ifp->rt->fib6_node)) ip6_ins_rt(net, ifp->rt); if (ifp->idev->cnf.forwarding) addrconf_join_anycast(ifp); diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 0e35657501a7..eed3bf63bd05 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -218,10 +218,10 @@ static void aca_put(struct ifacaddr6 *ac) } } -static struct ifacaddr6 *aca_alloc(struct fib6_info *rt, +static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i, const struct in6_addr *addr) { - struct inet6_dev *idev = rt->rt6i_idev; + struct inet6_dev *idev = f6i->fib6_idev; struct ifacaddr6 *aca; aca = kzalloc(sizeof(*aca), GFP_ATOMIC); @@ -231,8 +231,8 @@ static struct ifacaddr6 *aca_alloc(struct fib6_info *rt, aca->aca_addr = *addr; in6_dev_hold(idev); aca->aca_idev = idev; - fib6_info_hold(rt); - aca->aca_rt = rt; + fib6_info_hold(f6i); + aca->aca_rt = f6i; aca->aca_users = 1; /* aca_tstamp should be updated upon changes */ aca->aca_cstamp = aca->aca_tstamp = jiffies; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 2ab49b7cac22..353f0b3e7b0d 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -105,12 +105,12 @@ enum { FIB6_NO_SERNUM_CHANGE = 0, }; -void fib6_update_sernum(struct net *net, struct fib6_info *rt) +void fib6_update_sernum(struct net *net, struct fib6_info *f6i) { struct fib6_node *fn; - fn = rcu_dereference_protected(rt->rt6i_node, - lockdep_is_held(&rt->rt6i_table->tb6_lock)); + fn = rcu_dereference_protected(f6i->fib6_node, + lockdep_is_held(&f6i->fib6_table->tb6_lock)); if (fn) fn->fn_sernum = fib6_new_sernum(net); } @@ -159,10 +159,10 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags) return NULL; } - INIT_LIST_HEAD(&f6i->rt6i_siblings); + INIT_LIST_HEAD(&f6i->fib6_siblings); f6i->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; - atomic_inc(&f6i->rt6i_ref); + atomic_inc(&f6i->fib6_ref); return f6i; } @@ -172,7 +172,7 @@ void fib6_info_destroy(struct fib6_info *f6i) struct rt6_exception_bucket *bucket; struct dst_metrics *m; - WARN_ON(f6i->rt6i_node); + WARN_ON(f6i->fib6_node); bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, 1); if (bucket) { @@ -197,8 +197,8 @@ void fib6_info_destroy(struct fib6_info *f6i) } } - if (f6i->rt6i_idev) - in6_dev_put(f6i->rt6i_idev); + if (f6i->fib6_idev) + in6_dev_put(f6i->fib6_idev); if (f6i->fib6_nh.nh_dev) dev_put(f6i->fib6_nh.nh_dev); @@ -401,7 +401,7 @@ static int call_fib6_entry_notifiers(struct net *net, .rt = rt, }; - rt->rt6i_table->fib_seq++; + rt->fib6_table->fib_seq++; return call_fib6_notifiers(net, event_type, &info.info); } @@ -483,10 +483,10 @@ static int fib6_dump_node(struct fib6_walker *w) * last sibling of this route (no need to dump the * sibling routes again) */ - if (rt->rt6i_nsiblings) - rt = list_last_entry(&rt->rt6i_siblings, + if (rt->fib6_nsiblings) + rt = list_last_entry(&rt->fib6_siblings, struct fib6_info, - rt6i_siblings); + fib6_siblings); } w->leaf = NULL; return 0; @@ -810,7 +810,7 @@ insert_above: RCU_INIT_POINTER(in->parent, pn); in->leaf = fn->leaf; atomic_inc(&rcu_dereference_protected(in->leaf, - lockdep_is_held(&table->tb6_lock))->rt6i_ref); + 
lockdep_is_held(&table->tb6_lock))->fib6_ref); /* update parent pointer */ if (dir) @@ -865,9 +865,9 @@ insert_above: static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, struct net *net) { - struct fib6_table *table = rt->rt6i_table; + struct fib6_table *table = rt->fib6_table; - if (atomic_read(&rt->rt6i_ref) != 1) { + if (atomic_read(&rt->fib6_ref) != 1) { /* This route is used as dummy address holder in some split * nodes. It is not leaked, but it still holds other resources, * which must be released in time. So, scan ascendant nodes @@ -880,7 +880,7 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, struct fib6_info *new_leaf; if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) { new_leaf = fib6_find_prefix(net, table, fn); - atomic_inc(&new_leaf->rt6i_ref); + atomic_inc(&new_leaf->fib6_ref); rcu_assign_pointer(fn->leaf, new_leaf); fib6_info_release(rt); @@ -919,7 +919,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, struct netlink_ext_ack *extack) { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, - lockdep_is_held(&rt->rt6i_table->tb6_lock)); + lockdep_is_held(&rt->fib6_table->tb6_lock)); struct fib6_info *iter = NULL; struct fib6_info __rcu **ins; struct fib6_info __rcu **fallback_ins = NULL; @@ -939,12 +939,12 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, for (iter = leaf; iter; iter = rcu_dereference_protected(iter->rt6_next, - lockdep_is_held(&rt->rt6i_table->tb6_lock))) { + lockdep_is_held(&rt->fib6_table->tb6_lock))) { /* * Search for duplicates */ - if (iter->rt6i_metric == rt->rt6i_metric) { + if (iter->fib6_metric == rt->fib6_metric) { /* * Same priority level */ @@ -964,11 +964,11 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, } if (rt6_duplicate_nexthop(iter, rt)) { - if (rt->rt6i_nsiblings) - rt->rt6i_nsiblings = 0; - if (!(iter->rt6i_flags & RTF_EXPIRES)) + if (rt->fib6_nsiblings) + rt->fib6_nsiblings = 0; + if (!(iter->fib6_flags & RTF_EXPIRES)) return -EEXIST; - if (!(rt->rt6i_flags & RTF_EXPIRES)) + if (!(rt->fib6_flags & RTF_EXPIRES)) fib6_clean_expires(iter); else fib6_set_expires(iter, rt->expires); @@ -988,10 +988,10 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, */ if (rt_can_ecmp && rt6_qualify_for_ecmp(iter)) - rt->rt6i_nsiblings++; + rt->fib6_nsiblings++; } - if (iter->rt6i_metric > rt->rt6i_metric) + if (iter->fib6_metric > rt->fib6_metric) break; next_iter: @@ -1002,7 +1002,7 @@ next_iter: /* No ECMP-able route found, replace first non-ECMP one */ ins = fallback_ins; iter = rcu_dereference_protected(*ins, - lockdep_is_held(&rt->rt6i_table->tb6_lock)); + lockdep_is_held(&rt->fib6_table->tb6_lock)); found++; } @@ -1011,34 +1011,34 @@ next_iter: fn->rr_ptr = NULL; /* Link this route to others same route. 
*/ - if (rt->rt6i_nsiblings) { - unsigned int rt6i_nsiblings; + if (rt->fib6_nsiblings) { + unsigned int fib6_nsiblings; struct fib6_info *sibling, *temp_sibling; /* Find the first route that have the same metric */ sibling = leaf; while (sibling) { - if (sibling->rt6i_metric == rt->rt6i_metric && + if (sibling->fib6_metric == rt->fib6_metric && rt6_qualify_for_ecmp(sibling)) { - list_add_tail(&rt->rt6i_siblings, - &sibling->rt6i_siblings); + list_add_tail(&rt->fib6_siblings, + &sibling->fib6_siblings); break; } sibling = rcu_dereference_protected(sibling->rt6_next, - lockdep_is_held(&rt->rt6i_table->tb6_lock)); + lockdep_is_held(&rt->fib6_table->tb6_lock)); } /* For each sibling in the list, increment the counter of * siblings. BUG() if counters does not match, list of siblings * is broken! */ - rt6i_nsiblings = 0; + fib6_nsiblings = 0; list_for_each_entry_safe(sibling, temp_sibling, - &rt->rt6i_siblings, rt6i_siblings) { - sibling->rt6i_nsiblings++; - BUG_ON(sibling->rt6i_nsiblings != rt->rt6i_nsiblings); - rt6i_nsiblings++; + &rt->fib6_siblings, fib6_siblings) { + sibling->fib6_nsiblings++; + BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings); + fib6_nsiblings++; } - BUG_ON(rt6i_nsiblings != rt->rt6i_nsiblings); + BUG_ON(fib6_nsiblings != rt->fib6_nsiblings); rt6_multipath_rebalance(temp_sibling); } @@ -1059,8 +1059,8 @@ add: return err; rcu_assign_pointer(rt->rt6_next, iter); - atomic_inc(&rt->rt6i_ref); - rcu_assign_pointer(rt->rt6i_node, fn); + atomic_inc(&rt->fib6_ref); + rcu_assign_pointer(rt->fib6_node, fn); rcu_assign_pointer(*ins, rt); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); @@ -1087,8 +1087,8 @@ add: if (err) return err; - atomic_inc(&rt->rt6i_ref); - rcu_assign_pointer(rt->rt6i_node, fn); + atomic_inc(&rt->fib6_ref); + rcu_assign_pointer(rt->fib6_node, fn); rt->rt6_next = iter->rt6_next; rcu_assign_pointer(*ins, rt); if (!info->skip_notify) @@ -1097,8 +1097,8 @@ add: info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } - nsiblings = iter->rt6i_nsiblings; - iter->rt6i_node = NULL; + nsiblings = iter->fib6_nsiblings; + iter->fib6_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; @@ -1108,13 +1108,13 @@ add: /* Replacing an ECMP route, remove all siblings */ ins = &rt->rt6_next; iter = rcu_dereference_protected(*ins, - lockdep_is_held(&rt->rt6i_table->tb6_lock)); + lockdep_is_held(&rt->fib6_table->tb6_lock)); while (iter) { - if (iter->rt6i_metric > rt->rt6i_metric) + if (iter->fib6_metric > rt->fib6_metric) break; if (rt6_qualify_for_ecmp(iter)) { *ins = iter->rt6_next; - iter->rt6i_node = NULL; + iter->fib6_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; @@ -1125,7 +1125,7 @@ add: ins = &iter->rt6_next; } iter = rcu_dereference_protected(*ins, - lockdep_is_held(&rt->rt6i_table->tb6_lock)); + lockdep_is_held(&rt->fib6_table->tb6_lock)); } WARN_ON(nsiblings != 0); } @@ -1137,7 +1137,7 @@ add: static void fib6_start_gc(struct net *net, struct fib6_info *rt) { if (!timer_pending(&net->ipv6.ip6_fib_timer) && - (rt->rt6i_flags & RTF_EXPIRES)) + (rt->fib6_flags & RTF_EXPIRES)) mod_timer(&net->ipv6.ip6_fib_timer, jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } @@ -1152,15 +1152,15 @@ void fib6_force_start_gc(struct net *net) static void __fib6_update_sernum_upto_root(struct fib6_info *rt, int sernum) { - struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node, - 
lockdep_is_held(&rt->rt6i_table->tb6_lock)); + struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node, + lockdep_is_held(&rt->fib6_table->tb6_lock)); /* paired with smp_rmb() in rt6_get_cookie_safe() */ smp_wmb(); while (fn) { fn->fn_sernum = sernum; fn = rcu_dereference_protected(fn->parent, - lockdep_is_held(&rt->rt6i_table->tb6_lock)); + lockdep_is_held(&rt->fib6_table->tb6_lock)); } } @@ -1179,7 +1179,7 @@ void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt) int fib6_add(struct fib6_node *root, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { - struct fib6_table *table = rt->rt6i_table; + struct fib6_table *table = rt->fib6_table; struct fib6_node *fn, *pn = NULL; int err = -ENOMEM; int allow_create = 1; @@ -1196,8 +1196,8 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n"); fn = fib6_add_1(info->nl_net, table, root, - &rt->rt6i_dst.addr, rt->rt6i_dst.plen, - offsetof(struct fib6_info, rt6i_dst), allow_create, + &rt->fib6_dst.addr, rt->fib6_dst.plen, + offsetof(struct fib6_info, fib6_dst), allow_create, replace_required, extack); if (IS_ERR(fn)) { err = PTR_ERR(fn); @@ -1208,7 +1208,7 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, pn = fn; #ifdef CONFIG_IPV6_SUBTREES - if (rt->rt6i_src.plen) { + if (rt->fib6_src.plen) { struct fib6_node *sn; if (!rcu_access_pointer(fn->subtree)) { @@ -1229,7 +1229,7 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, if (!sfn) goto failure; - atomic_inc(&info->nl_net->ipv6.fib6_null_entry->rt6i_ref); + atomic_inc(&info->nl_net->ipv6.fib6_null_entry->fib6_ref); rcu_assign_pointer(sfn->leaf, info->nl_net->ipv6.fib6_null_entry); sfn->fn_flags = RTN_ROOT; @@ -1237,8 +1237,8 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, /* Now add the first leaf node to new subtree */ sn = fib6_add_1(info->nl_net, table, sfn, - &rt->rt6i_src.addr, rt->rt6i_src.plen, - offsetof(struct fib6_info, rt6i_src), + &rt->fib6_src.addr, rt->fib6_src.plen, + offsetof(struct fib6_info, fib6_src), allow_create, replace_required, extack); if (IS_ERR(sn)) { @@ -1256,8 +1256,8 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, rcu_assign_pointer(fn->subtree, sfn); } else { sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn), - &rt->rt6i_src.addr, rt->rt6i_src.plen, - offsetof(struct fib6_info, rt6i_src), + &rt->fib6_src.addr, rt->fib6_src.plen, + offsetof(struct fib6_info, fib6_src), allow_create, replace_required, extack); if (IS_ERR(sn)) { @@ -1272,7 +1272,7 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, rcu_assign_pointer(fn->leaf, info->nl_net->ipv6.fib6_null_entry); } else { - atomic_inc(&rt->rt6i_ref); + atomic_inc(&rt->fib6_ref); rcu_assign_pointer(fn->leaf, rt); } } @@ -1421,12 +1421,12 @@ struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *dad struct fib6_node *fn; struct lookup_args args[] = { { - .offset = offsetof(struct fib6_info, rt6i_dst), + .offset = offsetof(struct fib6_info, fib6_dst), .addr = daddr, }, #ifdef CONFIG_IPV6_SUBTREES { - .offset = offsetof(struct fib6_info, rt6i_src), + .offset = offsetof(struct fib6_info, fib6_src), .addr = saddr, }, #endif @@ -1511,7 +1511,7 @@ struct fib6_node *fib6_locate(struct fib6_node *root, struct fib6_node *fn; fn = fib6_locate_1(root, daddr, dst_len, - offsetof(struct fib6_info, rt6i_dst), + offsetof(struct fib6_info, fib6_dst), exact_match); #ifdef CONFIG_IPV6_SUBTREES @@ -1522,7 +1522,7 @@ struct fib6_node 
*fib6_locate(struct fib6_node *root, if (subtree) { fn = fib6_locate_1(subtree, saddr, src_len, - offsetof(struct fib6_info, rt6i_src), + offsetof(struct fib6_info, fib6_src), exact_match); } } @@ -1706,7 +1706,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, /* Unlink it */ *rtp = rt->rt6_next; - rt->rt6i_node = NULL; + rt->fib6_node = NULL; net->ipv6.rt6_stats->fib_rt_entries--; net->ipv6.rt6_stats->fib_discarded_routes++; @@ -1718,14 +1718,14 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, fn->rr_ptr = NULL; /* Remove this entry from other siblings */ - if (rt->rt6i_nsiblings) { + if (rt->fib6_nsiblings) { struct fib6_info *sibling, *next_sibling; list_for_each_entry_safe(sibling, next_sibling, - &rt->rt6i_siblings, rt6i_siblings) - sibling->rt6i_nsiblings--; - rt->rt6i_nsiblings = 0; - list_del_init(&rt->rt6i_siblings); + &rt->fib6_siblings, fib6_siblings) + sibling->fib6_nsiblings--; + rt->fib6_nsiblings = 0; + list_del_init(&rt->fib6_siblings); rt6_multipath_rebalance(next_sibling); } @@ -1765,9 +1765,9 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, /* Need to own table->tb6_lock */ int fib6_del(struct fib6_info *rt, struct nl_info *info) { - struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node, - lockdep_is_held(&rt->rt6i_table->tb6_lock)); - struct fib6_table *table = rt->rt6i_table; + struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node, + lockdep_is_held(&rt->fib6_table->tb6_lock)); + struct fib6_table *table = rt->fib6_table; struct net *net = info->nl_net; struct fib6_info __rcu **rtp; struct fib6_info __rcu **rtp_next; @@ -1951,17 +1951,17 @@ static int fib6_clean_node(struct fib6_walker *w) #if RT6_DEBUG >= 2 pr_debug("%s: del failed: rt=%p@%p err=%d\n", __func__, rt, - rcu_access_pointer(rt->rt6i_node), + rcu_access_pointer(rt->fib6_node), res); #endif continue; } return 0; } else if (res == -2) { - if (WARN_ON(!rt->rt6i_nsiblings)) + if (WARN_ON(!rt->fib6_nsiblings)) continue; - rt = list_last_entry(&rt->rt6i_siblings, - struct fib6_info, rt6i_siblings); + rt = list_last_entry(&rt->fib6_siblings, + struct fib6_info, fib6_siblings); continue; } WARN_ON(res != 0); @@ -2045,7 +2045,7 @@ static int fib6_age(struct fib6_info *rt, void *arg) * Routes are expired even if they are in use. */ - if (rt->rt6i_flags & RTF_EXPIRES && rt->expires) { + if (rt->fib6_flags & RTF_EXPIRES && rt->expires) { if (time_after(now, rt->expires)) { RT6_TRACE("expiring %p\n", rt); return -1; @@ -2243,22 +2243,22 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v) struct ipv6_route_iter *iter = seq->private; const struct net_device *dev; - seq_printf(seq, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen); + seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen); #ifdef CONFIG_IPV6_SUBTREES - seq_printf(seq, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen); + seq_printf(seq, "%pi6 %02x ", &rt->fib6_src.addr, rt->fib6_src.plen); #else seq_puts(seq, "00000000000000000000000000000000 00 "); #endif - if (rt->rt6i_flags & RTF_GATEWAY) + if (rt->fib6_flags & RTF_GATEWAY) seq_printf(seq, "%pi6", &rt->fib6_nh.nh_gw); else seq_puts(seq, "00000000000000000000000000000000"); dev = rt->fib6_nh.nh_dev; seq_printf(seq, " %08x %08x %08x %08x %8s\n", - rt->rt6i_metric, atomic_read(&rt->rt6i_ref), 0, - rt->rt6i_flags, dev ? dev->name : ""); + rt->fib6_metric, atomic_read(&rt->fib6_ref), 0, + rt->fib6_flags, dev ? 
dev->name : ""); iter->w.leaf = NULL; return 0; } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 102645298692..9ac5366064e3 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1318,7 +1318,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) } neigh->flags |= NTF_ROUTER; } else if (rt) { - rt->rt6i_flags = (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); + rt->fib6_flags = (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); } if (rt) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index f9c363327d62..e23ab0784e65 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -284,10 +284,10 @@ static const u32 ip6_template_metrics[RTAX_MAX] = { }; static const struct fib6_info fib6_null_entry_template = { - .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), - .rt6i_protocol = RTPROT_KERNEL, - .rt6i_metric = ~(u32)0, - .rt6i_ref = ATOMIC_INIT(1), + .fib6_flags = (RTF_REJECT | RTF_NONEXTHOP), + .fib6_protocol = RTPROT_KERNEL, + .fib6_metric = ~(u32)0, + .fib6_ref = ATOMIC_INIT(1), .fib6_type = RTN_UNREACHABLE, .fib6_metrics = (struct dst_metrics *)&dst_default_metrics, }; @@ -429,8 +429,8 @@ static struct fib6_info *rt6_multipath_select(const struct net *net, if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound)) return match; - list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings, - rt6i_siblings) { + list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings, + fib6_siblings) { int nh_upper_bound; nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound); @@ -472,12 +472,12 @@ static inline struct fib6_info *rt6_device_match(struct net *net, if (dev->ifindex == oif) return sprt; if (dev->flags & IFF_LOOPBACK) { - if (!sprt->rt6i_idev || - sprt->rt6i_idev->dev->ifindex != oif) { + if (!sprt->fib6_idev || + sprt->fib6_idev->dev->ifindex != oif) { if (flags & RT6_LOOKUP_F_IFACE) continue; if (local && - local->rt6i_idev->dev->ifindex == oif) + local->fib6_idev->dev->ifindex == oif) continue; } local = sprt; @@ -534,7 +534,7 @@ static void rt6_probe(struct fib6_info *rt) * Router Reachability Probe MUST be rate-limited * to no more than one per minute. 
*/ - if (!rt || !(rt->rt6i_flags & RTF_GATEWAY)) + if (!rt || !(rt->fib6_flags & RTF_GATEWAY)) return; nh_gw = &rt->fib6_nh.nh_gw; @@ -550,7 +550,7 @@ static void rt6_probe(struct fib6_info *rt) if (!(neigh->nud_state & NUD_VALID) && time_after(jiffies, neigh->updated + - rt->rt6i_idev->cnf.rtr_probe_interval)) { + rt->fib6_idev->cnf.rtr_probe_interval)) { work = kmalloc(sizeof(*work), GFP_ATOMIC); if (work) __neigh_set_probe_once(neigh); @@ -587,7 +587,7 @@ static inline int rt6_check_dev(struct fib6_info *rt, int oif) if (!oif || dev->ifindex == oif) return 2; if ((dev->flags & IFF_LOOPBACK) && - rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif) + rt->fib6_idev && rt->fib6_idev->dev->ifindex == oif) return 1; return 0; } @@ -597,8 +597,8 @@ static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt) enum rt6_nud_state ret = RT6_NUD_FAIL_HARD; struct neighbour *neigh; - if (rt->rt6i_flags & RTF_NONEXTHOP || - !(rt->rt6i_flags & RTF_GATEWAY)) + if (rt->fib6_flags & RTF_NONEXTHOP || + !(rt->fib6_flags & RTF_GATEWAY)) return RT6_NUD_SUCCEED; rcu_read_lock_bh(); @@ -632,7 +632,7 @@ static int rt6_score_route(struct fib6_info *rt, int oif, int strict) if (!m && (strict & RT6_LOOKUP_F_IFACE)) return RT6_NUD_FAIL_HARD; #ifdef CONFIG_IPV6_ROUTER_PREF - m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2; + m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2; #endif if (strict & RT6_LOOKUP_F_REACHABLE) { int n = rt6_check_neigh(rt); @@ -648,7 +648,7 @@ static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict, { int m; bool match_do_rr = false; - struct inet6_dev *idev = rt->rt6i_idev; + struct inet6_dev *idev = rt->fib6_idev; if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) goto out; @@ -694,7 +694,7 @@ static struct fib6_info *find_rr_leaf(struct fib6_node *fn, match = NULL; cont = NULL; for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) { - if (rt->rt6i_metric != metric) { + if (rt->fib6_metric != metric) { cont = rt; break; } @@ -704,7 +704,7 @@ static struct fib6_info *find_rr_leaf(struct fib6_node *fn, for (rt = leaf; rt && rt != rr_head; rt = rcu_dereference(rt->rt6_next)) { - if (rt->rt6i_metric != metric) { + if (rt->fib6_metric != metric) { cont = rt; break; } @@ -741,30 +741,30 @@ static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn, * (This might happen if all routes under fn are deleted from * the tree and fib6_repair_tree() is called on the node.) 
*/ - key_plen = rt0->rt6i_dst.plen; + key_plen = rt0->fib6_dst.plen; #ifdef CONFIG_IPV6_SUBTREES - if (rt0->rt6i_src.plen) - key_plen = rt0->rt6i_src.plen; + if (rt0->fib6_src.plen) + key_plen = rt0->fib6_src.plen; #endif if (fn->fn_bit != key_plen) return net->ipv6.fib6_null_entry; - match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict, + match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict, &do_rr); if (do_rr) { struct fib6_info *next = rcu_dereference(rt0->rt6_next); /* no entries matched; do round-robin */ - if (!next || next->rt6i_metric != rt0->rt6i_metric) + if (!next || next->fib6_metric != rt0->fib6_metric) next = leaf; if (next != rt0) { - spin_lock_bh(&leaf->rt6i_table->tb6_lock); + spin_lock_bh(&leaf->fib6_table->tb6_lock); /* make sure next is not being deleted from the tree */ - if (next->rt6i_node) + if (next->fib6_node) rcu_assign_pointer(fn->rr_ptr, next); - spin_unlock_bh(&leaf->rt6i_table->tb6_lock); + spin_unlock_bh(&leaf->fib6_table->tb6_lock); } } @@ -773,7 +773,7 @@ static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn, static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt) { - return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY)); + return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY)); } #ifdef CONFIG_IPV6_ROUTE_INFO @@ -837,8 +837,8 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev, pref); else if (rt) - rt->rt6i_flags = RTF_ROUTEINFO | - (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); + rt->fib6_flags = RTF_ROUTEINFO | + (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); if (rt) { if (!addrconf_finite_timeout(lifetime)) @@ -861,13 +861,13 @@ static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt) { struct net_device *dev = rt->fib6_nh.nh_dev; - if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) { + if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) { /* for copies of local routes, dst->dev needs to be the * device if it is a master device, the master device if * device is enslaved, and the loopback as the default */ if (netif_is_l3_slave(dev) && - !rt6_need_strict(&rt->rt6i_dst.addr)) + !rt6_need_strict(&rt->fib6_dst.addr)) dev = l3mdev_master_dev_rcu(dev); else if (!netif_is_l3_master(dev)) dev = dev_net(dev)->loopback_dev; @@ -939,7 +939,7 @@ static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort) { rt->dst.flags |= fib6_info_dst_flags(ort); - if (ort->rt6i_flags & RTF_REJECT) { + if (ort->fib6_flags & RTF_REJECT) { ip6_rt_init_dst_reject(rt, ort); return; } @@ -949,7 +949,7 @@ static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort) if (ort->fib6_type == RTN_LOCAL) { rt->dst.input = ip6_input; - } else if (ipv6_addr_type(&ort->rt6i_dst.addr) & IPV6_ADDR_MULTICAST) { + } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) { rt->dst.input = ip6_mc_input; } else { rt->dst.input = ip6_forward; @@ -979,17 +979,17 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort) { ip6_rt_init_dst(rt, ort); - rt->rt6i_dst = ort->rt6i_dst; - rt->rt6i_idev = ort->rt6i_idev; + rt->rt6i_dst = ort->fib6_dst; + rt->rt6i_idev = ort->fib6_idev; if (rt->rt6i_idev) in6_dev_hold(rt->rt6i_idev); rt->rt6i_gateway = ort->fib6_nh.nh_gw; - rt->rt6i_flags = ort->rt6i_flags; + rt->rt6i_flags = ort->fib6_flags; rt6_set_from(rt, ort); #ifdef CONFIG_IPV6_SUBTREES - rt->rt6i_src = ort->rt6i_src; + rt->rt6i_src = ort->fib6_src; #endif - rt->rt6i_prefsrc = ort->rt6i_prefsrc; + 
rt->rt6i_prefsrc = ort->fib6_prefsrc; rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate); } @@ -1064,7 +1064,7 @@ restart: } else { f6i = rt6_device_match(net, f6i, &fl6->saddr, fl6->flowi6_oif, flags); - if (f6i->rt6i_nsiblings && fl6->flowi6_oif == 0) + if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0) f6i = rt6_multipath_select(net, f6i, fl6, fl6->flowi6_oif, skb, flags); } @@ -1142,7 +1142,7 @@ static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info, int err; struct fib6_table *table; - table = rt->rt6i_table; + table = rt->fib6_table; spin_lock_bh(&table->tb6_lock); err = fib6_add(&table->tb6_root, rt, info, extack); spin_unlock_bh(&table->tb6_lock); @@ -1182,8 +1182,8 @@ static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort, rt->rt6i_dst.plen = 128; if (!rt6_is_gw_or_nonexthop(ort)) { - if (ort->rt6i_dst.plen != 128 && - ipv6_addr_equal(&ort->rt6i_dst.addr, daddr)) + if (ort->fib6_dst.plen != 128 && + ipv6_addr_equal(&ort->fib6_dst.addr, daddr)) rt->rt6i_flags |= RTF_ANYCAST; #ifdef CONFIG_IPV6_SUBTREES if (rt->rt6i_src.plen && saddr) { @@ -1375,7 +1375,7 @@ static unsigned int fib6_mtu(const struct fib6_info *rt) { unsigned int mtu; - mtu = rt->fib6_pmtu ? : rt->rt6i_idev->cnf.mtu6; + mtu = rt->fib6_pmtu ? : rt->fib6_idev->cnf.mtu6; mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu); @@ -1416,14 +1416,14 @@ static int rt6_insert_exception(struct rt6_info *nrt, * Otherwise, the exception table is indexed by * a hash of only rt6i_dst. */ - if (ort->rt6i_src.plen) + if (ort->fib6_src.plen) src_key = &nrt->rt6i_src.addr; #endif /* Update rt6i_prefsrc as it could be changed * in rt6_remove_prefsrc() */ - nrt->rt6i_prefsrc = ort->rt6i_prefsrc; + nrt->rt6i_prefsrc = ort->fib6_prefsrc; /* rt6_mtu_change() might lower mtu on ort. * Only insert this exception route if its mtu * is less than ort's mtu value. @@ -1457,9 +1457,9 @@ out: /* Update fn->fn_sernum to invalidate all cached dst */ if (!err) { - spin_lock_bh(&ort->rt6i_table->tb6_lock); + spin_lock_bh(&ort->fib6_table->tb6_lock); fib6_update_sernum(net, ort); - spin_unlock_bh(&ort->rt6i_table->tb6_lock); + spin_unlock_bh(&ort->fib6_table->tb6_lock); fib6_force_start_gc(net); } @@ -1514,7 +1514,7 @@ static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt, * Otherwise, the exception table is indexed by * a hash of only rt6i_dst. */ - if (rt->rt6i_src.plen) + if (rt->fib6_src.plen) src_key = saddr; #endif rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); @@ -1551,7 +1551,7 @@ static int rt6_remove_exception_rt(struct rt6_info *rt) * Otherwise, the exception table is indexed by * a hash of only rt6i_dst. */ - if (from->rt6i_src.plen) + if (from->fib6_src.plen) src_key = &rt->rt6i_src.addr; #endif rt6_ex = __rt6_find_exception_spinlock(&bucket, @@ -1592,7 +1592,7 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt) * Otherwise, the exception table is indexed by * a hash of only rt6i_dst. 
*/ - if (from->rt6i_src.plen) + if (from->fib6_src.plen) src_key = &rt->rt6i_src.addr; #endif rt6_ex = __rt6_find_exception_rcu(&bucket, @@ -1810,7 +1810,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, redo_rt6_select: f6i = rt6_select(net, fn, oif, strict); - if (f6i->rt6i_nsiblings) + if (f6i->fib6_nsiblings) f6i = rt6_multipath_select(net, f6i, fl6, oif, skb, strict); if (f6i == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); @@ -1842,7 +1842,7 @@ redo_rt6_select: trace_fib6_table_lookup(net, rt, table, fl6); return rt; } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && - !(f6i->rt6i_flags & RTF_GATEWAY))) { + !(f6i->fib6_flags & RTF_GATEWAY))) { /* Create a RTF_CACHE clone which will not be * owned by the fib6 tree. It is for the special case where * the daddr in the skb during the neighbor look-up is different @@ -2206,7 +2206,7 @@ static void ip6_link_failure(struct sk_buff *skb) struct fib6_node *fn; rcu_read_lock(); - fn = rcu_dereference(rt->from->rt6i_node); + fn = rcu_dereference(rt->from->fib6_node); if (fn && (rt->rt6i_flags & RTF_DEFAULT)) fn->fn_sernum = -1; rcu_read_unlock(); @@ -2372,9 +2372,9 @@ restart: continue; if (fib6_check_expired(rt)) continue; - if (rt->rt6i_flags & RTF_REJECT) + if (rt->fib6_flags & RTF_REJECT) break; - if (!(rt->rt6i_flags & RTF_GATEWAY)) + if (!(rt->fib6_flags & RTF_GATEWAY)) continue; if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex) continue; @@ -2400,7 +2400,7 @@ restart: if (!rt) rt = net->ipv6.fib6_null_entry; - else if (rt->rt6i_flags & RTF_REJECT) { + else if (rt->fib6_flags & RTF_REJECT) { ret = net->ipv6.ip6_null_entry; goto out; } @@ -2907,7 +2907,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, if (cfg->fc_protocol == RTPROT_UNSPEC) cfg->fc_protocol = RTPROT_BOOT; - rt->rt6i_protocol = cfg->fc_protocol; + rt->fib6_protocol = cfg->fc_protocol; addr_type = ipv6_addr_type(&cfg->fc_dst); @@ -2922,17 +2922,17 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate); } - ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); - rt->rt6i_dst.plen = cfg->fc_dst_len; - if (rt->rt6i_dst.plen == 128) + ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); + rt->fib6_dst.plen = cfg->fc_dst_len; + if (rt->fib6_dst.plen == 128) rt->dst_host = true; #ifdef CONFIG_IPV6_SUBTREES - ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len); - rt->rt6i_src.plen = cfg->fc_src_len; + ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); + rt->fib6_src.plen = cfg->fc_src_len; #endif - rt->rt6i_metric = cfg->fc_metric; + rt->fib6_metric = cfg->fc_metric; rt->fib6_nh.nh_weight = 1; rt->fib6_type = cfg->fc_type; @@ -2958,7 +2958,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, goto out; } } - rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP; + rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP; goto install_route; } @@ -2992,21 +2992,21 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, err = -EINVAL; goto out; } - rt->rt6i_prefsrc.addr = cfg->fc_prefsrc; - rt->rt6i_prefsrc.plen = 128; + rt->fib6_prefsrc.addr = cfg->fc_prefsrc; + rt->fib6_prefsrc.plen = 128; } else - rt->rt6i_prefsrc.plen = 0; + rt->fib6_prefsrc.plen = 0; - rt->rt6i_flags = cfg->fc_flags; + rt->fib6_flags = cfg->fc_flags; install_route: - if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) && + if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) && 
!netif_carrier_ok(dev)) rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK); rt->fib6_nh.nh_dev = dev; - rt->rt6i_idev = idev; - rt->rt6i_table = table; + rt->fib6_idev = idev; + rt->fib6_table = table; cfg->fc_nlinfo.nl_net = dev_net(dev); @@ -3048,7 +3048,7 @@ static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) goto out; } - table = rt->rt6i_table; + table = rt->fib6_table; spin_lock_bh(&table->tb6_lock); err = fib6_del(rt, info); spin_unlock_bh(&table->tb6_lock); @@ -3075,10 +3075,10 @@ static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) if (rt == net->ipv6.fib6_null_entry) goto out_put; - table = rt->rt6i_table; + table = rt->fib6_table; spin_lock_bh(&table->tb6_lock); - if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) { + if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { struct fib6_info *sibling, *next_sibling; /* prefer to send a single notification with all hops */ @@ -3096,8 +3096,8 @@ static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) } list_for_each_entry_safe(sibling, next_sibling, - &rt->rt6i_siblings, - rt6i_siblings) { + &rt->fib6_siblings, + fib6_siblings) { err = fib6_del(sibling, info); if (err) goto out_unlock; @@ -3176,9 +3176,9 @@ static int ip6_route_del(struct fib6_config *cfg, if (cfg->fc_flags & RTF_GATEWAY && !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw)) continue; - if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric) + if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) continue; - if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol) + if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol) continue; fib6_info_hold(rt); rcu_read_unlock(); @@ -3336,7 +3336,7 @@ static struct fib6_info *rt6_get_route_info(struct net *net, for_each_fib6_node_rt_rcu(fn) { if (rt->fib6_nh.nh_dev->ifindex != ifindex) continue; - if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) + if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) continue; if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr)) continue; @@ -3396,7 +3396,7 @@ struct fib6_info *rt6_get_dflt_router(struct net *net, rcu_read_lock(); for_each_fib6_node_rt_rcu(&table->tb6_root) { if (dev == rt->fib6_nh.nh_dev && - ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && + ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr)) break; } @@ -3445,8 +3445,8 @@ static void __rt6_purge_dflt_routers(struct net *net, restart: rcu_read_lock(); for_each_fib6_node_rt_rcu(&table->tb6_root) { - if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) && - (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) { + if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && + (!rt->fib6_idev || rt->fib6_idev->cnf.accept_ra != 2)) { fib6_info_hold(rt); rcu_read_unlock(); ip6_del_rt(net, rt); @@ -3607,26 +3607,26 @@ struct fib6_info *addrconf_dst_alloc(struct net *net, rt->dst_nocount = true; in6_dev_hold(idev); - rt->rt6i_idev = idev; + rt->fib6_idev = idev; rt->dst_host = true; - rt->rt6i_protocol = RTPROT_KERNEL; - rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP; + rt->fib6_protocol = RTPROT_KERNEL; + rt->fib6_flags = RTF_UP | RTF_NONEXTHOP; if (anycast) { rt->fib6_type = RTN_ANYCAST; - rt->rt6i_flags |= RTF_ANYCAST; + rt->fib6_flags |= RTF_ANYCAST; } else { rt->fib6_type = RTN_LOCAL; - rt->rt6i_flags |= RTF_LOCAL; + rt->fib6_flags |= RTF_LOCAL; } 
rt->fib6_nh.nh_gw = *addr; dev_hold(dev); rt->fib6_nh.nh_dev = dev; - rt->rt6i_dst.addr = *addr; - rt->rt6i_dst.plen = 128; + rt->fib6_dst.addr = *addr; + rt->fib6_dst.plen = 128; tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL; - rt->rt6i_table = fib6_get_table(net, tb_id); + rt->fib6_table = fib6_get_table(net, tb_id); return rt; } @@ -3646,10 +3646,10 @@ static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) if (((void *)rt->fib6_nh.nh_dev == dev || !dev) && rt != net->ipv6.fib6_null_entry && - ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) { + ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) { spin_lock_bh(&rt6_exception_lock); /* remove prefsrc entry */ - rt->rt6i_prefsrc.plen = 0; + rt->fib6_prefsrc.plen = 0; /* need to update cache as well */ rt6_exceptions_remove_prefsrc(rt); spin_unlock_bh(&rt6_exception_lock); @@ -3675,7 +3675,7 @@ static int fib6_clean_tohost(struct fib6_info *rt, void *arg) { struct in6_addr *gateway = (struct in6_addr *)arg; - if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && + if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) { return -1; } @@ -3707,16 +3707,16 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) struct fib6_info *iter; struct fib6_node *fn; - fn = rcu_dereference_protected(rt->rt6i_node, - lockdep_is_held(&rt->rt6i_table->tb6_lock)); + fn = rcu_dereference_protected(rt->fib6_node, + lockdep_is_held(&rt->fib6_table->tb6_lock)); iter = rcu_dereference_protected(fn->leaf, - lockdep_is_held(&rt->rt6i_table->tb6_lock)); + lockdep_is_held(&rt->fib6_table->tb6_lock)); while (iter) { - if (iter->rt6i_metric == rt->rt6i_metric && + if (iter->fib6_metric == rt->fib6_metric && rt6_qualify_for_ecmp(iter)) return iter; iter = rcu_dereference_protected(iter->rt6_next, - lockdep_is_held(&rt->rt6i_table->tb6_lock)); + lockdep_is_held(&rt->fib6_table->tb6_lock)); } return NULL; @@ -3726,7 +3726,7 @@ static bool rt6_is_dead(const struct fib6_info *rt) { if (rt->fib6_nh.nh_flags & RTNH_F_DEAD || (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && - rt->rt6i_idev->cnf.ignore_routes_with_linkdown)) + rt->fib6_idev->cnf.ignore_routes_with_linkdown)) return true; return false; @@ -3740,7 +3740,7 @@ static int rt6_multipath_total_weight(const struct fib6_info *rt) if (!rt6_is_dead(rt)) total += rt->fib6_nh.nh_weight; - list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) { + list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { if (!rt6_is_dead(iter)) total += iter->fib6_nh.nh_weight; } @@ -3767,7 +3767,7 @@ static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) rt6_upper_bound_set(rt, &weight, total); - list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) + list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) rt6_upper_bound_set(iter, &weight, total); } @@ -3780,7 +3780,7 @@ void rt6_multipath_rebalance(struct fib6_info *rt) * then there is no need to rebalance upon the removal of every * sibling route. 
*/ - if (!rt->rt6i_nsiblings || rt->should_flush) + if (!rt->fib6_nsiblings || rt->should_flush) return; /* During lookup routes are evaluated in order, so we need to @@ -3831,7 +3831,7 @@ static bool rt6_multipath_uses_dev(const struct fib6_info *rt, if (rt->fib6_nh.nh_dev == dev) return true; - list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) + list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) if (iter->fib6_nh.nh_dev == dev) return true; @@ -3843,7 +3843,7 @@ static void rt6_multipath_flush(struct fib6_info *rt) struct fib6_info *iter; rt->should_flush = 1; - list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) + list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) iter->should_flush = 1; } @@ -3856,7 +3856,7 @@ static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, if (rt->fib6_nh.nh_dev == down_dev || rt->fib6_nh.nh_flags & RTNH_F_DEAD) dead++; - list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) + list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) if (iter->fib6_nh.nh_dev == down_dev || iter->fib6_nh.nh_flags & RTNH_F_DEAD) dead++; @@ -3872,7 +3872,7 @@ static void rt6_multipath_nh_flags_set(struct fib6_info *rt, if (rt->fib6_nh.nh_dev == dev) rt->fib6_nh.nh_flags |= nh_flags; - list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) + list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) if (iter->fib6_nh.nh_dev == dev) iter->fib6_nh.nh_flags |= nh_flags; } @@ -3893,13 +3893,13 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg) case NETDEV_DOWN: if (rt->should_flush) return -1; - if (!rt->rt6i_nsiblings) + if (!rt->fib6_nsiblings) return rt->fib6_nh.nh_dev == dev ? -1 : 0; if (rt6_multipath_uses_dev(rt, dev)) { unsigned int count; count = rt6_multipath_dead_count(rt, dev); - if (rt->rt6i_nsiblings + 1 == count) { + if (rt->fib6_nsiblings + 1 == count) { rt6_multipath_flush(rt); return -1; } @@ -3911,7 +3911,7 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg) return -2; case NETDEV_CHANGE: if (rt->fib6_nh.nh_dev != dev || - rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) + rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) break; rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; rt6_multipath_rebalance(rt); @@ -4188,10 +4188,10 @@ static void ip6_route_mpath_notify(struct fib6_info *rt, * nexthop. 
Since sibling routes are always added at the end of * the list, find the first sibling of the last route appended */ - if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) { - rt = list_first_entry(&rt_last->rt6i_siblings, + if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) { + rt = list_first_entry(&rt_last->fib6_siblings, struct fib6_info, - rt6i_siblings); + fib6_siblings); } if (rt) @@ -4410,13 +4410,13 @@ static size_t rt6_nlmsg_size(struct fib6_info *rt) { int nexthop_len = 0; - if (rt->rt6i_nsiblings) { + if (rt->fib6_nsiblings) { nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ + NLA_ALIGN(sizeof(struct rtnexthop)) + nla_total_size(16) /* RTA_GATEWAY */ + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate); - nexthop_len *= rt->rt6i_nsiblings; + nexthop_len *= rt->fib6_nsiblings; } return NLMSG_ALIGN(sizeof(struct rtmsg)) @@ -4444,11 +4444,11 @@ static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt, if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) { *flags |= RTNH_F_LINKDOWN; - if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown) + if (rt->fib6_idev->cnf.ignore_routes_with_linkdown) *flags |= RTNH_F_DEAD; } - if (rt->rt6i_flags & RTF_GATEWAY) { + if (rt->fib6_flags & RTF_GATEWAY) { if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0) goto nla_put_failure; } @@ -4518,11 +4518,11 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, rtm = nlmsg_data(nlh); rtm->rtm_family = AF_INET6; - rtm->rtm_dst_len = rt->rt6i_dst.plen; - rtm->rtm_src_len = rt->rt6i_src.plen; + rtm->rtm_dst_len = rt->fib6_dst.plen; + rtm->rtm_src_len = rt->fib6_src.plen; rtm->rtm_tos = 0; - if (rt->rt6i_table) - table = rt->rt6i_table->tb6_id; + if (rt->fib6_table) + table = rt->fib6_table->tb6_id; else table = RT6_TABLE_UNSPEC; rtm->rtm_table = table; @@ -4532,9 +4532,9 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, rtm->rtm_type = rt->fib6_type; rtm->rtm_flags = 0; rtm->rtm_scope = RT_SCOPE_UNIVERSE; - rtm->rtm_protocol = rt->rt6i_protocol; + rtm->rtm_protocol = rt->fib6_protocol; - if (rt->rt6i_flags & RTF_CACHE) + if (rt->fib6_flags & RTF_CACHE) rtm->rtm_flags |= RTM_F_CLONED; if (dest) { @@ -4542,7 +4542,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, goto nla_put_failure; rtm->rtm_dst_len = 128; } else if (rtm->rtm_dst_len) - if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr)) + if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr)) goto nla_put_failure; #ifdef CONFIG_IPV6_SUBTREES if (src) { @@ -4550,12 +4550,12 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, goto nla_put_failure; rtm->rtm_src_len = 128; } else if (rtm->rtm_src_len && - nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr)) + nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr)) goto nla_put_failure; #endif if (iif) { #ifdef CONFIG_IPV6_MROUTE - if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) { + if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) { int err = ip6mr_get_route(net, skb, rtm, portid); if (err == 0) @@ -4573,9 +4573,9 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, goto nla_put_failure; } - if (rt->rt6i_prefsrc.plen) { + if (rt->fib6_prefsrc.plen) { struct in6_addr saddr_buf; - saddr_buf = rt->rt6i_prefsrc.addr; + saddr_buf = rt->fib6_prefsrc.addr; if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) goto nla_put_failure; } @@ -4584,13 +4584,13 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, if (rtnetlink_put_metrics(skb, pmetrics) < 0) goto nla_put_failure; - if 
(nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric)) + if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric)) goto nla_put_failure; /* For multipath routes, walk the siblings list and add * each as a nexthop within RTA_MULTIPATH. */ - if (rt->rt6i_nsiblings) { + if (rt->fib6_nsiblings) { struct fib6_info *sibling, *next_sibling; struct nlattr *mp; @@ -4602,7 +4602,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, goto nla_put_failure; list_for_each_entry_safe(sibling, next_sibling, - &rt->rt6i_siblings, rt6i_siblings) { + &rt->fib6_siblings, fib6_siblings) { if (rt6_add_nexthop(skb, sibling) < 0) goto nla_put_failure; } @@ -4613,7 +4613,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, goto nla_put_failure; } - if (rt->rt6i_flags & RTF_EXPIRES) { + if (rt->fib6_flags & RTF_EXPIRES) { expires = dst ? dst->expires : rt->expires; expires -= jiffies; } @@ -4621,7 +4621,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0) goto nla_put_failure; - if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags))) + if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags))) goto nla_put_failure; @@ -4646,7 +4646,7 @@ int rt6_dump_route(struct fib6_info *rt, void *p_arg) /* user wants prefix routes only */ if (rtm->rtm_flags & RTM_F_PREFIX && - !(rt->rt6i_flags & RTF_PREFIX_RT)) { + !(rt->fib6_flags & RTF_PREFIX_RT)) { /* success since this is not a prefix route */ return 1; } @@ -4820,7 +4820,7 @@ static int ip6_route_dev_notify(struct notifier_block *this, if (event == NETDEV_REGISTER) { net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev; - net->ipv6.fib6_null_entry->rt6i_idev = in6_dev_get(dev); + net->ipv6.fib6_null_entry->fib6_idev = in6_dev_get(dev); net->ipv6.ip6_null_entry->dst.dev = dev; net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES @@ -4834,7 +4834,7 @@ static int ip6_route_dev_notify(struct notifier_block *this, /* NETDEV_UNREGISTER could be fired for multiple times by * netdev_wait_allrefs(). Make sure we only call this once. */ - in6_dev_put_clear(&net->ipv6.fib6_null_entry->rt6i_idev); + in6_dev_put_clear(&net->ipv6.fib6_null_entry->fib6_idev); in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev); @@ -5157,7 +5157,7 @@ void __init ip6_route_init_special_entries(void) * the loopback reference in rt6_info will not be taken, do it * manually for init_net */ init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev; - init_net.ipv6.fib6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); + init_net.ipv6.fib6_null_entry->fib6_idev = in6_dev_get(init_net.loopback_dev); init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES -- cgit v1.2.3 From 360a9887c8c01a715b2b4b131f7c7462f7cce576 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 18 Apr 2018 15:39:00 -0700 Subject: net/ipv6: Rename addrconf_dst_alloc addrconf_dst_alloc now returns a fib6_info. Update the name and its users to reflect the change. Rename only; no functional change intended. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_route.h | 2 +- net/ipv6/addrconf.c | 28 ++++++++++++++-------------- net/ipv6/anycast.c | 14 +++++++------- net/ipv6/route.c | 44 ++++++++++++++++++++++---------------------- 4 files changed, 44 insertions(+), 44 deletions(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index b4b85109984f..31e24f821ef6 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -136,7 +136,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6); void fib6_force_start_gc(struct net *net); -struct fib6_info *addrconf_dst_alloc(struct net *net, struct inet6_dev *idev, +struct fib6_info *addrconf_f6i_alloc(struct net *net, struct inet6_dev *idev, const struct in6_addr *addr, bool anycast, gfp_t gfp_flags); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f38ea829c28b..e6dd886f8f1f 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -994,7 +994,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, gfp_t gfp_flags = can_block ? GFP_KERNEL : GFP_ATOMIC; struct net *net = dev_net(idev->dev); struct inet6_ifaddr *ifa = NULL; - struct fib6_info *rt = NULL; + struct fib6_info *f6i = NULL; int err = 0; int addr_type = ipv6_addr_type(addr); @@ -1036,16 +1036,16 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, goto out; } - rt = addrconf_dst_alloc(net, idev, addr, false, gfp_flags); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); - rt = NULL; + f6i = addrconf_f6i_alloc(net, idev, addr, false, gfp_flags); + if (IS_ERR(f6i)) { + err = PTR_ERR(f6i); + f6i = NULL; goto out; } if (net->ipv6.devconf_all->disable_policy || idev->cnf.disable_policy) - rt->dst_nopolicy = true; + f6i->dst_nopolicy = true; neigh_parms_data_state_setall(idev->nd_parms); @@ -1067,7 +1067,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, ifa->cstamp = ifa->tstamp = jiffies; ifa->tokenized = false; - ifa->rt = rt; + ifa->rt = f6i; ifa->idev = idev; in6_dev_hold(idev); @@ -1101,7 +1101,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, inet6addr_notifier_call_chain(NETDEV_UP, ifa); out: if (unlikely(err < 0)) { - fib6_info_release(rt); + fib6_info_release(f6i); if (ifa) { if (ifa->idev) @@ -3346,17 +3346,17 @@ static int fixup_permanent_addr(struct net *net, * case regenerate the host route. 
*/ if (!ifp->rt || !ifp->rt->fib6_node) { - struct fib6_info *rt, *prev; + struct fib6_info *f6i, *prev; - rt = addrconf_dst_alloc(net, idev, &ifp->addr, false, - GFP_ATOMIC); - if (IS_ERR(rt)) - return PTR_ERR(rt); + f6i = addrconf_f6i_alloc(net, idev, &ifp->addr, false, + GFP_ATOMIC); + if (IS_ERR(f6i)) + return PTR_ERR(f6i); /* ifp->rt can be accessed outside of rtnl */ spin_lock(&ifp->lock); prev = ifp->rt; - ifp->rt = rt; + ifp->rt = f6i; spin_unlock(&ifp->lock); fib6_info_release(prev); diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index eed3bf63bd05..5c3e74d05018 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -247,7 +247,7 @@ static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i, int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) { struct ifacaddr6 *aca; - struct fib6_info *rt; + struct fib6_info *f6i; struct net *net; int err; @@ -268,14 +268,14 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) } net = dev_net(idev->dev); - rt = addrconf_dst_alloc(net, idev, addr, true, GFP_ATOMIC); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); + f6i = addrconf_f6i_alloc(net, idev, addr, true, GFP_ATOMIC); + if (IS_ERR(f6i)) { + err = PTR_ERR(f6i); goto out; } - aca = aca_alloc(rt, addr); + aca = aca_alloc(f6i, addr); if (!aca) { - fib6_info_release(rt); + fib6_info_release(f6i); err = -ENOMEM; goto out; } @@ -289,7 +289,7 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) aca_get(aca); write_unlock_bh(&idev->lock); - ip6_ins_rt(net, rt); + ip6_ins_rt(net, f6i); addrconf_join_solict(idev->dev, &aca->aca_addr); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e23ab0784e65..e44f82848143 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3591,44 +3591,44 @@ static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff * Allocate a dst for local (unicast / anycast) address. */ -struct fib6_info *addrconf_dst_alloc(struct net *net, - struct inet6_dev *idev, - const struct in6_addr *addr, - bool anycast, gfp_t gfp_flags) +struct fib6_info *addrconf_f6i_alloc(struct net *net, + struct inet6_dev *idev, + const struct in6_addr *addr, + bool anycast, gfp_t gfp_flags) { u32 tb_id; struct net_device *dev = idev->dev; - struct fib6_info *rt; + struct fib6_info *f6i; - rt = fib6_info_alloc(gfp_flags); - if (!rt) + f6i = fib6_info_alloc(gfp_flags); + if (!f6i) return ERR_PTR(-ENOMEM); - rt->dst_nocount = true; + f6i->dst_nocount = true; in6_dev_hold(idev); - rt->fib6_idev = idev; + f6i->fib6_idev = idev; - rt->dst_host = true; - rt->fib6_protocol = RTPROT_KERNEL; - rt->fib6_flags = RTF_UP | RTF_NONEXTHOP; + f6i->dst_host = true; + f6i->fib6_protocol = RTPROT_KERNEL; + f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP; if (anycast) { - rt->fib6_type = RTN_ANYCAST; - rt->fib6_flags |= RTF_ANYCAST; + f6i->fib6_type = RTN_ANYCAST; + f6i->fib6_flags |= RTF_ANYCAST; } else { - rt->fib6_type = RTN_LOCAL; - rt->fib6_flags |= RTF_LOCAL; + f6i->fib6_type = RTN_LOCAL; + f6i->fib6_flags |= RTF_LOCAL; } - rt->fib6_nh.nh_gw = *addr; + f6i->fib6_nh.nh_gw = *addr; dev_hold(dev); - rt->fib6_nh.nh_dev = dev; - rt->fib6_dst.addr = *addr; - rt->fib6_dst.plen = 128; + f6i->fib6_nh.nh_dev = dev; + f6i->fib6_dst.addr = *addr; + f6i->fib6_dst.plen = 128; tb_id = l3mdev_fib_table(idev->dev) ? 
: RT6_TABLE_LOCAL; - rt->fib6_table = fib6_get_table(net, tb_id); + f6i->fib6_table = fib6_get_table(net, tb_id); - return rt; + return f6i; } /* remove deleted ip from prefsrc entries */ -- cgit v1.2.3 From 9ee8cbb2fd4a7d6f483a20c4b8e82d8b1cf685fa Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 18 Apr 2018 15:39:01 -0700 Subject: net/ipv6: Remove aca_idev aca_idev has only 1 user - inet6_fill_ifacaddr - and it only wants the device index which can be extracted from the fib6_info nexthop. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/if_inet6.h | 1 - include/net/ip6_fib.h | 5 +++++ net/ipv6/addrconf.c | 3 ++- net/ipv6/anycast.c | 4 ---- 4 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index d6089b2e64fe..db389253dc2a 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -143,7 +143,6 @@ struct ipv6_ac_socklist { struct ifacaddr6 { struct in6_addr aca_addr; - struct inet6_dev *aca_idev; struct fib6_info *aca_rt; struct ifacaddr6 *aca_next; int aca_users; diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 994dd67207fb..bd11c990c353 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -410,6 +410,11 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack); int fib6_del(struct fib6_info *rt, struct nl_info *info); +static inline struct net_device *fib6_info_nh_dev(const struct fib6_info *f6i) +{ + return f6i->fib6_nh.nh_dev; +} + void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, unsigned int flags); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index e6dd886f8f1f..6c42c5d5fafa 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4817,9 +4817,10 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca, static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca, u32 portid, u32 seq, int event, unsigned int flags) { + struct net_device *dev = fib6_info_nh_dev(ifaca->aca_rt); + int ifindex = dev ? 
dev->ifindex : 1; struct nlmsghdr *nlh; u8 scope = RT_SCOPE_UNIVERSE; - int ifindex = ifaca->aca_idev->dev->ifindex; if (ipv6_addr_scope(&ifaca->aca_addr) & IFA_SITE) scope = RT_SCOPE_SITE; diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 5c3e74d05018..0250d199e527 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -212,7 +212,6 @@ static void aca_get(struct ifacaddr6 *aca) static void aca_put(struct ifacaddr6 *ac) { if (refcount_dec_and_test(&ac->aca_refcnt)) { - in6_dev_put(ac->aca_idev); fib6_info_release(ac->aca_rt); kfree(ac); } @@ -221,7 +220,6 @@ static void aca_put(struct ifacaddr6 *ac) static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i, const struct in6_addr *addr) { - struct inet6_dev *idev = f6i->fib6_idev; struct ifacaddr6 *aca; aca = kzalloc(sizeof(*aca), GFP_ATOMIC); @@ -229,8 +227,6 @@ static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i, return NULL; aca->aca_addr = *addr; - in6_dev_hold(idev); - aca->aca_idev = idev; fib6_info_hold(f6i); aca->aca_rt = f6i; aca->aca_users = 1; -- cgit v1.2.3 From 6fe749414446d86cfaae1c84bfdd556fe4fa8fca Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 18 Apr 2018 15:39:03 -0700 Subject: net/ipv6: Change ip6_route_get_saddr to get dev from route Prior to 4832c30d5458 ("net: ipv6: put host and anycast routes on device with address") host routes and anycast routes were installed with the device set to loopback (or VRF device once that feature was added). In the older code dst.dev was set to loopback (needed for packet tx) and rt6i_idev was used to denote the actual interface. Commit 4832c30d5458 changed the code to have dst.dev pointing to the real device with the switch to lo or vrf device done on dst clones. As a consequence of this change ip6_route_get_saddr can just pass the nexthop device to ipv6_dev_get_saddr. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_route.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 31e24f821ef6..c0620330035c 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -114,14 +114,15 @@ static inline int ip6_route_get_saddr(struct net *net, struct fib6_info *f6i, unsigned int prefs, struct in6_addr *saddr) { - struct inet6_dev *idev = f6i ? f6i->fib6_idev : NULL; int err = 0; - if (f6i && f6i->fib6_prefsrc.plen) + if (f6i && f6i->fib6_prefsrc.plen) { *saddr = f6i->fib6_prefsrc.addr; - else - err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL, - daddr, prefs, saddr); + } else { + struct net_device *dev = f6i ? fib6_info_nh_dev(f6i) : NULL; + + err = ipv6_dev_get_saddr(net, dev, daddr, prefs, saddr); + } return err; } -- cgit v1.2.3 From 647d4c1363a85bec63ecf929d4ab4aae78b2a960 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 18 Apr 2018 15:39:04 -0700 Subject: net/ipv6: Remove compare of fib6_idev from rt6_duplicate_nexthop After 4832c30d5458 ("net: ipv6: put host and anycast routes on device with address") the comparison of idev does not add value since it correlates to the nexthop device which is already compared. Remove the idev comparison. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_route.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index c0620330035c..8df4ff798b04 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -275,7 +275,6 @@ static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt, static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info *b) { return a->fib6_nh.nh_dev == b->fib6_nh.nh_dev && - a->fib6_idev == b->fib6_idev && ipv6_addr_equal(&a->fib6_nh.nh_gw, &b->fib6_nh.nh_gw) && !lwtunnel_cmp_encap(a->fib6_nh.nh_lwtstate, b->fib6_nh.nh_lwtstate); } -- cgit v1.2.3 From dcd1f572954f9d66d7b4a65df894ed5b4c467368 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 18 Apr 2018 15:39:05 -0700 Subject: net/ipv6: Remove fib6_idev fib6_idev can be obtained from __in6_dev_get on the nexthop device rather than caching it in the fib6_info. Remove it. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 1 - net/ipv6/ip6_fib.c | 2 -- net/ipv6/route.c | 66 ++++++++++++++++++++++++++++++++++++--------------- 3 files changed, 47 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index bd11c990c353..642211174692 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -147,7 +147,6 @@ struct fib6_info { unsigned int fib6_nsiblings; atomic_t fib6_ref; - struct inet6_dev *fib6_idev; unsigned long expires; struct dst_metrics *fib6_metrics; #define fib6_pmtu fib6_metrics->metrics[RTAX_MTU-1] diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 353f0b3e7b0d..f936e91a8c65 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -197,8 +197,6 @@ void fib6_info_destroy(struct fib6_info *f6i) } } - if (f6i->fib6_idev) - in6_dev_put(f6i->fib6_idev); if (f6i->fib6_nh.nh_dev) dev_put(f6i->fib6_nh.nh_dev); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 8cf4f0623229..08cae7323776 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -525,15 +525,17 @@ static void rt6_probe(struct fib6_info *rt) rcu_read_lock_bh(); neigh = __ipv6_neigh_lookup_noref(dev, nh_gw); if (neigh) { + struct inet6_dev *idev; + if (neigh->nud_state & NUD_VALID) goto out; + idev = __in6_dev_get(dev); work = NULL; write_lock(&neigh->lock); if (!(neigh->nud_state & NUD_VALID) && time_after(jiffies, - neigh->updated + - rt->fib6_idev->cnf.rtr_probe_interval)) { + neigh->updated + idev->cnf.rtr_probe_interval)) { work = kmalloc(sizeof(*work), GFP_ATOMIC); if (work) __neigh_set_probe_once(neigh); @@ -622,18 +624,32 @@ static int rt6_score_route(struct fib6_info *rt, int oif, int strict) return m; } +/* called with rcu_read_lock held */ +static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i) +{ + const struct net_device *dev = fib6_info_nh_dev(f6i); + bool rc = false; + + if (dev) { + const struct inet6_dev *idev = __in6_dev_get(dev); + + rc = !!idev->cnf.ignore_routes_with_linkdown; + } + + return rc; +} + static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict, int *mpri, struct fib6_info *match, bool *do_rr) { int m; bool match_do_rr = false; - struct inet6_dev *idev = rt->fib6_idev; if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) goto out; - if (idev->cnf.ignore_routes_with_linkdown && + if (fib6_ignore_linkdown(rt) && rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE)) goto out; @@ -957,12 +973,12 @@ static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from) static void
ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort) { + struct net_device *dev = fib6_info_nh_dev(ort); + ip6_rt_init_dst(rt, ort); rt->rt6i_dst = ort->fib6_dst; - rt->rt6i_idev = ort->fib6_idev; - if (rt->rt6i_idev) - in6_dev_hold(rt->rt6i_idev); + rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL; rt->rt6i_gateway = ort->fib6_nh.nh_gw; rt->rt6i_flags = ort->fib6_flags; rt6_set_from(rt, ort); @@ -1355,7 +1371,18 @@ static unsigned int fib6_mtu(const struct fib6_info *rt) { unsigned int mtu; - mtu = rt->fib6_pmtu ? : rt->fib6_idev->cnf.mtu6; + if (rt->fib6_pmtu) { + mtu = rt->fib6_pmtu; + } else { + struct net_device *dev = fib6_info_nh_dev(rt); + struct inet6_dev *idev; + + rcu_read_lock(); + idev = __in6_dev_get(dev); + mtu = idev->cnf.mtu6; + rcu_read_unlock(); + } + mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu); @@ -2985,11 +3012,13 @@ install_route: rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK); rt->fib6_nh.nh_dev = dev; - rt->fib6_idev = idev; rt->fib6_table = table; cfg->fc_nlinfo.nl_net = dev_net(dev); + if (idev) + in6_dev_put(idev); + return rt; out: if (dev) @@ -3425,8 +3454,11 @@ static void __rt6_purge_dflt_routers(struct net *net, restart: rcu_read_lock(); for_each_fib6_node_rt_rcu(&table->tb6_root) { + struct net_device *dev = fib6_info_nh_dev(rt); + struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL; + if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && - (!rt->fib6_idev || rt->fib6_idev->cnf.accept_ra != 2)) { + (!idev || idev->cnf.accept_ra != 2)) { fib6_info_hold(rt); rcu_read_unlock(); ip6_del_rt(net, rt); @@ -3585,10 +3617,6 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net, return ERR_PTR(-ENOMEM); f6i->dst_nocount = true; - - in6_dev_hold(idev); - f6i->fib6_idev = idev; - f6i->dst_host = true; f6i->fib6_protocol = RTPROT_KERNEL; f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP; @@ -3706,7 +3734,7 @@ static bool rt6_is_dead(const struct fib6_info *rt) { if (rt->fib6_nh.nh_flags & RTNH_F_DEAD || (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && - rt->fib6_idev->cnf.ignore_routes_with_linkdown)) + fib6_ignore_linkdown(rt))) return true; return false; @@ -4424,8 +4452,11 @@ static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt, if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) { *flags |= RTNH_F_LINKDOWN; - if (rt->fib6_idev->cnf.ignore_routes_with_linkdown) + + rcu_read_lock(); + if (fib6_ignore_linkdown(rt)) *flags |= RTNH_F_DEAD; + rcu_read_unlock(); } if (rt->fib6_flags & RTF_GATEWAY) { @@ -4800,7 +4831,6 @@ static int ip6_route_dev_notify(struct notifier_block *this, if (event == NETDEV_REGISTER) { net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev; - net->ipv6.fib6_null_entry->fib6_idev = in6_dev_get(dev); net->ipv6.ip6_null_entry->dst.dev = dev; net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES @@ -4814,7 +4844,6 @@ static int ip6_route_dev_notify(struct notifier_block *this, /* NETDEV_UNREGISTER could be fired for multiple times by * netdev_wait_allrefs(). Make sure we only call this once. 
*/ - in6_dev_put_clear(&net->ipv6.fib6_null_entry->fib6_idev); in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev); @@ -5137,7 +5166,6 @@ void __init ip6_route_init_special_entries(void) * the loopback reference in rt6_info will not be taken, do it * manually for init_net */ init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev; - init_net.ipv6.fib6_null_entry->fib6_idev = in6_dev_get(init_net.loopback_dev); init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES -- cgit v1.2.3 From 69b693f0aefa0ed521e8bd02260523b5ae446ad7 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 18 Apr 2018 15:55:57 -0700 Subject: bpf: btf: Introduce BPF Type Format (BTF) This patch introduces BPF Type Format (BTF). BTF is the metadata format which describes the data types of BPF programs/maps. Hence, it focuses primarily on the C programming language, which modern BPF mainly targets. The first use case is to provide a generic pretty-print capability for a BPF map. BTF has its roots in CTF (Compact C-Type Format). To simplify the handling of BTF data, BTF removes the differences between small and big type/struct-member: BTF consistently uses u32 instead of supporting both "one u16" and "two u32 (+padding)" in describing type and struct-member. It also raises the limit on the number of types (and functions) from 0x7fff to 0x7fffffff. Due to the above changes, the format is not compatible with CTF. Hence, BTF starts with a new BTF_MAGIC and version number. This patch does the first verification pass over the BTF. The first pass checks: 1. metadata size (e.g. it does not go beyond the total BTF size) 2. name_offset is valid 3. each BTF_KIND (e.g. int, enum, struct, ...) does its own check of its metadata. Some other checks, like checking that a struct's member refers to a valid type, can only be done in the second pass. The second verification pass will be implemented in the next patch.
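To make the layout described above concrete, here is a minimal, hypothetical userspace sketch (not part of this patch): it builds the smallest blob this first verification pass would accept, a single BTF_KIND_INT describing a signed 32-bit "int" plus its string section. The struct definitions mirror the uapi header added below (the union in the real 'struct btf_type' is flattened to a plain u32 since INT only uses "size"); the buffer size and names are illustrative.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* userspace mirrors of the uapi structures added by this patch */
struct btf_header {
	uint16_t magic;
	uint8_t  version;
	uint8_t  flags;
	uint32_t parent_label;
	uint32_t parent_name;
	uint32_t label_off;
	uint32_t object_off;
	uint32_t func_off;
	uint32_t type_off;
	uint32_t str_off;
	uint32_t str_len;
};

struct btf_type {
	uint32_t name;
	uint32_t info;	/* bits 24-28: kind, bits 0-15: vlen */
	uint32_t size;	/* BTF_KIND_INT uses the "size" member */
};

#define BTF_MAGIC	0xeB9F
#define BTF_VERSION	1
#define BTF_KIND_INT	1
#define BTF_INT_SIGNED	0x1

int main(void)
{
	static const char strs[] = "\0int";	/* first byte must be '\0' */
	struct btf_type t = {
		.name = 1,			/* offset of "int" in strs */
		.info = BTF_KIND_INT << 24,	/* kind INT, vlen 0 */
		.size = 4,			/* the type is 4 bytes wide */
	};
	/* BTF_KIND_INT carries one extra u32: encoding/offset/bits */
	uint32_t int_data = (BTF_INT_SIGNED << 24) | 32;
	struct btf_header hdr = {
		.magic	  = BTF_MAGIC,
		.version  = BTF_VERSION,
		.type_off = 0,		/* relative to the end of the header */
		.str_off  = sizeof(t) + sizeof(int_data),
		.str_len  = sizeof(strs),
	};
	unsigned char blob[64];
	size_t off = 0;

	memcpy(blob + off, &hdr, sizeof(hdr));			off += sizeof(hdr);
	memcpy(blob + off, &t, sizeof(t));			off += sizeof(t);
	memcpy(blob + off, &int_data, sizeof(int_data));	off += sizeof(int_data);
	memcpy(blob + off, strs, sizeof(strs));			off += sizeof(strs);

	/* 57 bytes: 36 (header) + 12 (btf_type) + 4 (int_data) + 5 (strings) */
	printf("minimal BTF blob: %zu bytes\n", off);
	return 0;
}

Such a blob satisfies every check in the first pass below: the magic, version and flags match, type_off is 4-byte aligned and precedes str_off, the string section starts and ends with '\0', and the INT's nr_bits (32) rounds up to no more than its declared size.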
Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/uapi/linux/btf.h | 130 +++++++ kernel/bpf/Makefile | 1 + kernel/bpf/btf.c | 915 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 1046 insertions(+) create mode 100644 include/uapi/linux/btf.h create mode 100644 kernel/bpf/btf.c (limited to 'include') diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h new file mode 100644 index 000000000000..74a30b1090df --- /dev/null +++ b/include/uapi/linux/btf.h @@ -0,0 +1,130 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* Copyright (c) 2018 Facebook */ +#ifndef _UAPI__LINUX_BTF_H__ +#define _UAPI__LINUX_BTF_H__ + +#include + +#define BTF_MAGIC 0xeB9F +#define BTF_MAGIC_SWAP 0x9FeB +#define BTF_VERSION 1 +#define BTF_FLAGS_COMPR 0x01 + +struct btf_header { + __u16 magic; + __u8 version; + __u8 flags; + + __u32 parent_label; + __u32 parent_name; + + /* All offsets are in bytes relative to the end of this header */ + __u32 label_off; /* offset of label section */ + __u32 object_off; /* offset of data object section*/ + __u32 func_off; /* offset of function section */ + __u32 type_off; /* offset of type section */ + __u32 str_off; /* offset of string section */ + __u32 str_len; /* length of string section */ +}; + +/* Max # of type identifier */ +#define BTF_MAX_TYPE 0x7fffffff +/* Max offset into the string section */ +#define BTF_MAX_NAME_OFFSET 0x7fffffff +/* Max # of struct/union/enum members or func args */ +#define BTF_MAX_VLEN 0xffff + +/* The type id is referring to a parent BTF */ +#define BTF_TYPE_PARENT(id) (((id) >> 31) & 0x1) +#define BTF_TYPE_ID(id) ((id) & BTF_MAX_TYPE) + +/* String is in the ELF string section */ +#define BTF_STR_TBL_ELF_ID(ref) (((ref) >> 31) & 0x1) +#define BTF_STR_OFFSET(ref) ((ref) & BTF_MAX_NAME_OFFSET) + +struct btf_type { + __u32 name; + /* "info" bits arrangement + * bits 0-15: vlen (e.g. # of struct's members) + * bits 16-23: unused + * bits 24-28: kind (e.g. int, ptr, array...etc) + * bits 29-30: unused + * bits 31: root + */ + __u32 info; + /* "size" is used by INT, ENUM, STRUCT and UNION. + * "size" tells the size of the type it is describing. + * + * "type" is used by PTR, TYPEDEF, VOLATILE, CONST and RESTRICT. + * "type" is a type_id referring to another type. + */ + union { + __u32 size; + __u32 type; + }; +}; + +#define BTF_INFO_KIND(info) (((info) >> 24) & 0x1f) +#define BTF_INFO_ISROOT(info) (!!(((info) >> 24) & 0x80)) +#define BTF_INFO_VLEN(info) ((info) & 0xffff) + +#define BTF_KIND_UNKN 0 /* Unknown */ +#define BTF_KIND_INT 1 /* Integer */ +#define BTF_KIND_PTR 2 /* Pointer */ +#define BTF_KIND_ARRAY 3 /* Array */ +#define BTF_KIND_STRUCT 4 /* Struct */ +#define BTF_KIND_UNION 5 /* Union */ +#define BTF_KIND_ENUM 6 /* Enumeration */ +#define BTF_KIND_FWD 7 /* Forward */ +#define BTF_KIND_TYPEDEF 8 /* Typedef */ +#define BTF_KIND_VOLATILE 9 /* Volatile */ +#define BTF_KIND_CONST 10 /* Const */ +#define BTF_KIND_RESTRICT 11 /* Restrict */ +#define BTF_KIND_MAX 11 +#define NR_BTF_KINDS 12 + +/* For some specific BTF_KIND, "struct btf_type" is immediately + * followed by extra data. 
+ */ + +/* BTF_KIND_INT is followed by a u32 and the following + * is the 32 bits arrangement: + */ +#define BTF_INT_ENCODING(VAL) (((VAL) & 0xff000000) >> 24) +#define BTF_INT_OFFSET(VAL) (((VAL & 0x00ff0000)) >> 16) +#define BTF_INT_BITS(VAL) ((VAL) & 0x0000ffff) + +/* Attributes stored in the BTF_INT_ENCODING */ +#define BTF_INT_SIGNED 0x1 +#define BTF_INT_CHAR 0x2 +#define BTF_INT_BOOL 0x4 +#define BTF_INT_VARARGS 0x8 + +/* BTF_KIND_ENUM is followed by multiple "struct btf_enum". + * The exact number of btf_enum is stored in the vlen (of the + * info in "struct btf_type"). + */ +struct btf_enum { + __u32 name; + __s32 val; +}; + +/* BTF_KIND_ARRAY is followed by one "struct btf_array" */ +struct btf_array { + __u32 type; + __u32 index_type; + __u32 nelems; +}; + +/* BTF_KIND_STRUCT and BTF_KIND_UNION are followed + * by multiple "struct btf_member". The exact number + * of btf_member is stored in the vlen (of the info in + * "struct btf_type"). + */ +struct btf_member { + __u32 name; + __u32 type; + __u32 offset; /* offset in bits */ +}; + +#endif /* _UAPI__LINUX_BTF_H__ */ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index a713fd23ec88..35c485fa9ea3 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -4,6 +4,7 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o +obj-$(CONFIG_BPF_SYSCALL) += btf.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c new file mode 100644 index 000000000000..26e9ed7cea5f --- /dev/null +++ b/kernel/bpf/btf.c @@ -0,0 +1,915 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2018 Facebook */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* BTF (BPF Type Format) is the metadata format which describes + * the data types of BPF programs/maps. Hence, it focuses primarily + * on the C programming language, which modern BPF mainly targets. + * + * ELF Section: + * ~~~~~~~~~~~ + * The BTF data is stored under the ".BTF" ELF section + * + * struct btf_type: + * ~~~~~~~~~~~~~~~ + * Each 'struct btf_type' object describes a C data type. + * Depending on the type it is describing, a 'struct btf_type' + * object may be followed by more data, e.g. to describe an array, + * 'struct btf_type' is followed by 'struct btf_array'. + * + * 'struct btf_type' and any extra data following it are + * 4 bytes aligned. + * + * Type section: + * ~~~~~~~~~~~~~ + * The BTF type section contains a list of 'struct btf_type' objects. + * Each one describes a C type. Recall from the above section + * that a 'struct btf_type' object could be immediately followed by extra + * data in order to describe some particular C types. + * + * type_id: + * ~~~~~~~ + * Each btf_type object is identified by a type_id, which is + * implied by the location of the btf_type object in + * the BTF type section. The first one has type_id 1, the second + * one has type_id 2, ...etc. Hence, an earlier btf_type has + * a smaller type_id. + * + * A btf_type object may refer to another btf_type object by using + * type_id (i.e. the "type" in the "struct btf_type"). + * + * NOTE that we cannot assume any reference-order. + * A btf_type object can refer to an earlier btf_type object + * but it can also refer to a later btf_type object.
+ * + * For example, to describe "const void *", a btf_type + * object describing "const" may refer to another btf_type + * object describing "void *". This type-reference is done + * by specifying type_id: + * + * [1] CONST (anon) type_id=2 + * [2] PTR (anon) type_id=0 + * + * The above is the btf_verifier debug log: + * - Each line starting with "[?]" is a btf_type object + * - [?] is the type_id of the btf_type object. + * - CONST/PTR is the BTF_KIND_XXX + * - "(anon)" is the name of the type. It just + * happens that CONST and PTR have no name. + * - type_id=XXX is the 'u32 type' in btf_type + * + * NOTE: "void" has type_id 0 + * + * String section: + * ~~~~~~~~~~~~~~ + * The BTF string section contains the names used by the type section. + * Each string is referred to by an "offset" from the beginning of the + * string section. + * + * Each string is '\0' terminated. + * + * The first character in the string section must be '\0', + * which is used to mean 'anonymous'. Some btf_type may not + * have a name. + */ + +/* BTF verification: + * + * To verify BTF data, two passes are needed. + * + * Pass #1 + * ~~~~~~~ + * The first pass collects all btf_type objects into + * an array: "btf->types". + * + * Depending on the C type that a btf_type is describing, + * a btf_type may be followed by extra data. We don't know + * how many btf_type objects there are, and more importantly we don't + * know where each btf_type is located in the type section. + * + * Without knowing the location of each type_id, most verifications + * cannot be done, e.g. an earlier btf_type may refer to a later + * btf_type (recall the "const void *" above), so we cannot + * check this type-reference in the first pass. + * + * The first pass still does some verification (e.g. + * checking that the name is a valid offset into the string section). + */ + +#define BITS_PER_U64 (sizeof(u64) * BITS_PER_BYTE) +#define BITS_PER_BYTE_MASK (BITS_PER_BYTE - 1) +#define BITS_PER_BYTE_MASKED(bits) ((bits) & BITS_PER_BYTE_MASK) +#define BITS_ROUNDDOWN_BYTES(bits) ((bits) >> 3) +#define BITS_ROUNDUP_BYTES(bits) \ + (BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits)) + +/* 16MB for 64k structs, each with 16 members, and + * a few MB of space for the string section. + * The hard limit is S32_MAX. + */ +#define BTF_MAX_SIZE (16 * 1024 * 1024) +/* 64k. We can raise it later. The hard limit is S32_MAX.
*/ +#define BTF_MAX_NR_TYPES 65535 + +#define for_each_member(i, struct_type, member) \ + for (i = 0, member = btf_type_member(struct_type); \ + i < btf_type_vlen(struct_type); \ + i++, member++) + +struct btf { + union { + struct btf_header *hdr; + void *data; + }; + struct btf_type **types; + const char *strings; + void *nohdr_data; + u32 nr_types; + u32 types_size; + u32 data_size; +}; + +struct btf_verifier_env { + struct btf *btf; + struct bpf_verifier_log log; + u32 log_type_id; +}; + +static const char * const btf_kind_str[NR_BTF_KINDS] = { + [BTF_KIND_UNKN] = "UNKNOWN", + [BTF_KIND_INT] = "INT", + [BTF_KIND_PTR] = "PTR", + [BTF_KIND_ARRAY] = "ARRAY", + [BTF_KIND_STRUCT] = "STRUCT", + [BTF_KIND_UNION] = "UNION", + [BTF_KIND_ENUM] = "ENUM", + [BTF_KIND_FWD] = "FWD", + [BTF_KIND_TYPEDEF] = "TYPEDEF", + [BTF_KIND_VOLATILE] = "VOLATILE", + [BTF_KIND_CONST] = "CONST", + [BTF_KIND_RESTRICT] = "RESTRICT", +}; + +struct btf_kind_operations { + s32 (*check_meta)(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left); + void (*log_details)(struct btf_verifier_env *env, + const struct btf_type *t); +}; + +static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS]; +static struct btf_type btf_void; + +static const char *btf_int_encoding_str(u8 encoding) +{ + if (encoding == 0) + return "(none)"; + else if (encoding == BTF_INT_SIGNED) + return "SIGNED"; + else if (encoding == BTF_INT_CHAR) + return "CHAR"; + else if (encoding == BTF_INT_BOOL) + return "BOOL"; + else if (encoding == BTF_INT_VARARGS) + return "VARARGS"; + else + return "UNKN"; +} + +static u16 btf_type_vlen(const struct btf_type *t) +{ + return BTF_INFO_VLEN(t->info); +} + +static u32 btf_type_int(const struct btf_type *t) +{ + return *(u32 *)(t + 1); +} + +static const struct btf_array *btf_type_array(const struct btf_type *t) +{ + return (const struct btf_array *)(t + 1); +} + +static const struct btf_member *btf_type_member(const struct btf_type *t) +{ + return (const struct btf_member *)(t + 1); +} + +static const struct btf_enum *btf_type_enum(const struct btf_type *t) +{ + return (const struct btf_enum *)(t + 1); +} + +static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) +{ + return kind_ops[BTF_INFO_KIND(t->info)]; +} + +static bool btf_name_offset_valid(const struct btf *btf, u32 offset) +{ + return !BTF_STR_TBL_ELF_ID(offset) && + BTF_STR_OFFSET(offset) < btf->hdr->str_len; +} + +static const char *btf_name_by_offset(const struct btf *btf, u32 offset) +{ + if (!BTF_STR_OFFSET(offset)) + return "(anon)"; + else if (BTF_STR_OFFSET(offset) < btf->hdr->str_len) + return &btf->strings[BTF_STR_OFFSET(offset)]; + else + return "(invalid-name-offset)"; +} + +__printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log, + const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + bpf_verifier_vlog(log, fmt, args); + va_end(args); +} + +__printf(2, 3) static void btf_verifier_log(struct btf_verifier_env *env, + const char *fmt, ...) +{ + struct bpf_verifier_log *log = &env->log; + va_list args; + + if (!bpf_verifier_log_needed(log)) + return; + + va_start(args, fmt); + bpf_verifier_vlog(log, fmt, args); + va_end(args); +} + +__printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env, + const struct btf_type *t, + bool log_details, + const char *fmt, ...) 
+{ + struct bpf_verifier_log *log = &env->log; + u8 kind = BTF_INFO_KIND(t->info); + struct btf *btf = env->btf; + va_list args; + + if (!bpf_verifier_log_needed(log)) + return; + + __btf_verifier_log(log, "[%u] %s %s%s", + env->log_type_id, + btf_kind_str[kind], + btf_name_by_offset(btf, t->name), + log_details ? " " : ""); + + if (log_details) + btf_type_ops(t)->log_details(env, t); + + if (fmt && *fmt) { + __btf_verifier_log(log, " "); + va_start(args, fmt); + bpf_verifier_vlog(log, fmt, args); + va_end(args); + } + + __btf_verifier_log(log, "\n"); +} + +#define btf_verifier_log_type(env, t, ...) \ + __btf_verifier_log_type((env), (t), true, __VA_ARGS__) +#define btf_verifier_log_basic(env, t, ...) \ + __btf_verifier_log_type((env), (t), false, __VA_ARGS__) + +__printf(4, 5) +static void btf_verifier_log_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const char *fmt, ...) +{ + struct bpf_verifier_log *log = &env->log; + struct btf *btf = env->btf; + va_list args; + + if (!bpf_verifier_log_needed(log)) + return; + + __btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u", + btf_name_by_offset(btf, member->name), + member->type, member->offset); + + if (fmt && *fmt) { + __btf_verifier_log(log, " "); + va_start(args, fmt); + bpf_verifier_vlog(log, fmt, args); + va_end(args); + } + + __btf_verifier_log(log, "\n"); +} + +static void btf_verifier_log_hdr(struct btf_verifier_env *env) +{ + struct bpf_verifier_log *log = &env->log; + const struct btf *btf = env->btf; + const struct btf_header *hdr; + + if (!bpf_verifier_log_needed(log)) + return; + + hdr = btf->hdr; + __btf_verifier_log(log, "magic: 0x%x\n", hdr->magic); + __btf_verifier_log(log, "version: %u\n", hdr->version); + __btf_verifier_log(log, "flags: 0x%x\n", hdr->flags); + __btf_verifier_log(log, "parent_label: %u\n", hdr->parent_label); + __btf_verifier_log(log, "parent_name: %u\n", hdr->parent_name); + __btf_verifier_log(log, "label_off: %u\n", hdr->label_off); + __btf_verifier_log(log, "object_off: %u\n", hdr->object_off); + __btf_verifier_log(log, "func_off: %u\n", hdr->func_off); + __btf_verifier_log(log, "type_off: %u\n", hdr->type_off); + __btf_verifier_log(log, "str_off: %u\n", hdr->str_off); + __btf_verifier_log(log, "str_len: %u\n", hdr->str_len); + __btf_verifier_log(log, "btf_total_size: %u\n", btf->data_size); +} + +static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t) +{ + struct btf *btf = env->btf; + + /* < 2 because +1 for btf_void which is always in btf->types[0]. + * btf_void is not accounted in btf->nr_types because btf_void + * does not come from the BTF file. 
+ */ + if (btf->types_size - btf->nr_types < 2) { + /* Expand 'types' array */ + + struct btf_type **new_types; + u32 expand_by, new_size; + + if (btf->types_size == BTF_MAX_NR_TYPES) { + btf_verifier_log(env, "Exceeded max num of types"); + return -E2BIG; + } + + expand_by = max_t(u32, btf->types_size >> 2, 16); + new_size = min_t(u32, BTF_MAX_NR_TYPES, + btf->types_size + expand_by); + + new_types = kvzalloc(new_size * sizeof(*new_types), + GFP_KERNEL | __GFP_NOWARN); + if (!new_types) + return -ENOMEM; + + if (btf->nr_types == 0) + new_types[0] = &btf_void; + else + memcpy(new_types, btf->types, + sizeof(*btf->types) * (btf->nr_types + 1)); + + kvfree(btf->types); + btf->types = new_types; + btf->types_size = new_size; + } + + btf->types[++(btf->nr_types)] = t; + + return 0; +} + +static void btf_free(struct btf *btf) +{ + kvfree(btf->types); + kvfree(btf->data); + kfree(btf); +} + +static void btf_verifier_env_free(struct btf_verifier_env *env) +{ + kfree(env); +} + +static s32 btf_int_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + u32 int_data, nr_bits, meta_needed = sizeof(int_data); + u16 encoding; + + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + if (btf_type_vlen(t)) { + btf_verifier_log_type(env, t, "vlen != 0"); + return -EINVAL; + } + + int_data = btf_type_int(t); + nr_bits = BTF_INT_BITS(int_data) + BTF_INT_OFFSET(int_data); + + if (nr_bits > BITS_PER_U64) { + btf_verifier_log_type(env, t, "nr_bits exceeds %zu", + BITS_PER_U64); + return -EINVAL; + } + + if (BITS_ROUNDUP_BYTES(nr_bits) > t->size) { + btf_verifier_log_type(env, t, "nr_bits exceeds type_size"); + return -EINVAL; + } + + encoding = BTF_INT_ENCODING(int_data); + if (encoding && + encoding != BTF_INT_SIGNED && + encoding != BTF_INT_CHAR && + encoding != BTF_INT_BOOL && + encoding != BTF_INT_VARARGS) { + btf_verifier_log_type(env, t, "Unsupported encoding"); + return -ENOTSUPP; + } + + btf_verifier_log_type(env, t, NULL); + + return meta_needed; +} + +static void btf_int_log(struct btf_verifier_env *env, + const struct btf_type *t) +{ + int int_data = btf_type_int(t); + + btf_verifier_log(env, + "size=%u bits_offset=%u nr_bits=%u encoding=%s", + t->size, BTF_INT_OFFSET(int_data), + BTF_INT_BITS(int_data), + btf_int_encoding_str(BTF_INT_ENCODING(int_data))); +} + +static const struct btf_kind_operations int_ops = { + .check_meta = btf_int_check_meta, + .log_details = btf_int_log, +}; + +static int btf_ref_type_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + if (btf_type_vlen(t)) { + btf_verifier_log_type(env, t, "vlen != 0"); + return -EINVAL; + } + + if (BTF_TYPE_PARENT(t->type)) { + btf_verifier_log_type(env, t, "Invalid type_id"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + return 0; +} + +static void btf_ref_type_log(struct btf_verifier_env *env, + const struct btf_type *t) +{ + btf_verifier_log(env, "type_id=%u", t->type); +} + +static struct btf_kind_operations modifier_ops = { + .check_meta = btf_ref_type_check_meta, + .log_details = btf_ref_type_log, +}; + +static struct btf_kind_operations ptr_ops = { + .check_meta = btf_ref_type_check_meta, + .log_details = btf_ref_type_log, +}; + +static struct btf_kind_operations fwd_ops = { + .check_meta = btf_ref_type_check_meta, + .log_details = btf_ref_type_log, +}; + +static s32 btf_array_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, 
+ u32 meta_left) +{ + const struct btf_array *array = btf_type_array(t); + u32 meta_needed = sizeof(*array); + + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + if (btf_type_vlen(t)) { + btf_verifier_log_type(env, t, "vlen != 0"); + return -EINVAL; + } + + /* We are a little forgiving on array->index_type since + * the kernel is not using it. + */ + /* Array elem cannot be in type void, + * so !array->type is not allowed. + */ + if (!array->type || BTF_TYPE_PARENT(array->type)) { + btf_verifier_log_type(env, t, "Invalid type_id"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + return meta_needed; +} + +static void btf_array_log(struct btf_verifier_env *env, + const struct btf_type *t) +{ + const struct btf_array *array = btf_type_array(t); + + btf_verifier_log(env, "type_id=%u index_type_id=%u nr_elems=%u", + array->type, array->index_type, array->nelems); +} + +static struct btf_kind_operations array_ops = { + .check_meta = btf_array_check_meta, + .log_details = btf_array_log, +}; + +static s32 btf_struct_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + bool is_union = BTF_INFO_KIND(t->info) == BTF_KIND_UNION; + const struct btf_member *member; + struct btf *btf = env->btf; + u32 struct_size = t->size; + u32 meta_needed; + u16 i; + + meta_needed = btf_type_vlen(t) * sizeof(*member); + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + for_each_member(i, t, member) { + if (!btf_name_offset_valid(btf, member->name)) { + btf_verifier_log_member(env, t, member, + "Invalid member name_offset:%u", + member->name); + return -EINVAL; + } + + /* A member cannot be in type void */ + if (!member->type || BTF_TYPE_PARENT(member->type)) { + btf_verifier_log_member(env, t, member, + "Invalid type_id"); + return -EINVAL; + } + + if (is_union && member->offset) { + btf_verifier_log_member(env, t, member, + "Invalid member bits_offset"); + return -EINVAL; + } + + if (BITS_ROUNDUP_BYTES(member->offset) > struct_size) { + btf_verifier_log_member(env, t, member, + "Member bits_offset exceeds its struct size"); + return -EINVAL; + } + + btf_verifier_log_member(env, t, member, NULL); + } + + return meta_needed; +} + +static void btf_struct_log(struct btf_verifier_env *env, + const struct btf_type *t) +{ + btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); +} + +static struct btf_kind_operations struct_ops = { + .check_meta = btf_struct_check_meta, + .log_details = btf_struct_log, +}; + +static s32 btf_enum_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + const struct btf_enum *enums = btf_type_enum(t); + struct btf *btf = env->btf; + u16 i, nr_enums; + u32 meta_needed; + + nr_enums = btf_type_vlen(t); + meta_needed = nr_enums * sizeof(*enums); + + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + if (t->size != sizeof(int)) { + btf_verifier_log_type(env, t, "Expected size:%zu", + sizeof(int)); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + for (i = 0; i < nr_enums; i++) { + if (!btf_name_offset_valid(btf, enums[i].name)) { + btf_verifier_log(env, "\tInvalid name_offset:%u", + enums[i].name); + return -EINVAL; + } + +
btf_verifier_log(env, "\t%s val=%d\n", + btf_name_by_offset(btf, enums[i].name), + enums[i].val); + } + + return meta_needed; +} + +static void btf_enum_log(struct btf_verifier_env *env, + const struct btf_type *t) +{ + btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); +} + +static struct btf_kind_operations enum_ops = { + .check_meta = btf_enum_check_meta, + .log_details = btf_enum_log, +}; + +static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = { + [BTF_KIND_INT] = &int_ops, + [BTF_KIND_PTR] = &ptr_ops, + [BTF_KIND_ARRAY] = &array_ops, + [BTF_KIND_STRUCT] = &struct_ops, + [BTF_KIND_UNION] = &struct_ops, + [BTF_KIND_ENUM] = &enum_ops, + [BTF_KIND_FWD] = &fwd_ops, + [BTF_KIND_TYPEDEF] = &modifier_ops, + [BTF_KIND_VOLATILE] = &modifier_ops, + [BTF_KIND_CONST] = &modifier_ops, + [BTF_KIND_RESTRICT] = &modifier_ops, +}; + +static s32 btf_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + u32 saved_meta_left = meta_left; + s32 var_meta_size; + + if (meta_left < sizeof(*t)) { + btf_verifier_log(env, "[%u] meta_left:%u meta_needed:%zu", + env->log_type_id, meta_left, sizeof(*t)); + return -EINVAL; + } + meta_left -= sizeof(*t); + + if (BTF_INFO_KIND(t->info) > BTF_KIND_MAX || + BTF_INFO_KIND(t->info) == BTF_KIND_UNKN) { + btf_verifier_log(env, "[%u] Invalid kind:%u", + env->log_type_id, BTF_INFO_KIND(t->info)); + return -EINVAL; + } + + if (!btf_name_offset_valid(env->btf, t->name)) { + btf_verifier_log(env, "[%u] Invalid name_offset:%u", + env->log_type_id, t->name); + return -EINVAL; + } + + var_meta_size = btf_type_ops(t)->check_meta(env, t, meta_left); + if (var_meta_size < 0) + return var_meta_size; + + meta_left -= var_meta_size; + + return saved_meta_left - meta_left; +} + +static int btf_check_all_metas(struct btf_verifier_env *env) +{ + struct btf *btf = env->btf; + struct btf_header *hdr; + void *cur, *end; + + hdr = btf->hdr; + cur = btf->nohdr_data + hdr->type_off; + end = btf->nohdr_data + hdr->str_off; + + env->log_type_id = 1; + while (cur < end) { + struct btf_type *t = cur; + s32 meta_size; + + meta_size = btf_check_meta(env, t, end - cur); + if (meta_size < 0) + return meta_size; + + btf_add_type(env, t); + cur += meta_size; + env->log_type_id++; + } + + return 0; +} + +static int btf_parse_type_sec(struct btf_verifier_env *env) +{ + return btf_check_all_metas(env); +} + +static int btf_parse_str_sec(struct btf_verifier_env *env) +{ + const struct btf_header *hdr; + struct btf *btf = env->btf; + const char *start, *end; + + hdr = btf->hdr; + start = btf->nohdr_data + hdr->str_off; + end = start + hdr->str_len; + + if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET || + start[0] || end[-1]) { + btf_verifier_log(env, "Invalid string section"); + return -EINVAL; + } + + btf->strings = start; + + return 0; +} + +static int btf_parse_hdr(struct btf_verifier_env *env) +{ + const struct btf_header *hdr; + struct btf *btf = env->btf; + u32 meta_left; + + if (btf->data_size < sizeof(*hdr)) { + btf_verifier_log(env, "btf_header not found"); + return -EINVAL; + } + + btf_verifier_log_hdr(env); + + hdr = btf->hdr; + if (hdr->magic != BTF_MAGIC) { + btf_verifier_log(env, "Invalid magic"); + return -EINVAL; + } + + if (hdr->version != BTF_VERSION) { + btf_verifier_log(env, "Unsupported version"); + return -ENOTSUPP; + } + + if (hdr->flags) { + btf_verifier_log(env, "Unsupported flags"); + return -ENOTSUPP; + } + + meta_left = btf->data_size - sizeof(*hdr); + if (!meta_left) { + btf_verifier_log(env, "No 
data"); + return -EINVAL; + } + + if (meta_left < hdr->type_off || hdr->str_off <= hdr->type_off || + /* Type section must align to 4 bytes */ + hdr->type_off & (sizeof(u32) - 1)) { + btf_verifier_log(env, "Invalid type_off"); + return -EINVAL; + } + + if (meta_left < hdr->str_off || + meta_left - hdr->str_off < hdr->str_len) { + btf_verifier_log(env, "Invalid str_off or str_len"); + return -EINVAL; + } + + btf->nohdr_data = btf->hdr + 1; + + return 0; +} + +static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size, + u32 log_level, char __user *log_ubuf, u32 log_size) +{ + struct btf_verifier_env *env = NULL; + struct bpf_verifier_log *log; + struct btf *btf = NULL; + u8 *data; + int err; + + if (btf_data_size > BTF_MAX_SIZE) + return ERR_PTR(-E2BIG); + + env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); + if (!env) + return ERR_PTR(-ENOMEM); + + log = &env->log; + if (log_level || log_ubuf || log_size) { + /* user requested verbose verifier output + * and supplied buffer to store the verification trace + */ + log->level = log_level; + log->ubuf = log_ubuf; + log->len_total = log_size; + + /* log attributes have to be sane */ + if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 || + !log->level || !log->ubuf) { + err = -EINVAL; + goto errout; + } + } + + btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN); + if (!btf) { + err = -ENOMEM; + goto errout; + } + + data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN); + if (!data) { + err = -ENOMEM; + goto errout; + } + + btf->data = data; + btf->data_size = btf_data_size; + + if (copy_from_user(data, btf_data, btf_data_size)) { + err = -EFAULT; + goto errout; + } + + env->btf = btf; + + err = btf_parse_hdr(env); + if (err) + goto errout; + + err = btf_parse_str_sec(env); + if (err) + goto errout; + + err = btf_parse_type_sec(env); + if (err) + goto errout; + + if (!err && log->level && bpf_verifier_log_full(log)) { + err = -ENOSPC; + goto errout; + } + + if (!err) { + btf_verifier_env_free(env); + return btf; + } + +errout: + btf_verifier_env_free(env); + if (btf) + btf_free(btf); + return ERR_PTR(err); +} -- cgit v1.2.3 From eb3f595dab40b61011c5f123507a7db2df6f0e65 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 18 Apr 2018 15:55:58 -0700 Subject: bpf: btf: Validate type reference After collecting all btf_type in the first pass in an earlier patch, the second pass (in this patch) can validate the reference types (e.g. the referring type does exist and it does not refer to itself). While checking the reference type, it also gathers other information (e.g. the size of an array). This info will be useful in checking the struct's members in a later patch. They will also be useful in doing pretty print later. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/btf.h | 37 +++ kernel/bpf/btf.c | 666 +++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 702 insertions(+), 1 deletion(-) create mode 100644 include/linux/btf.h (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h new file mode 100644 index 000000000000..f14b60368753 --- /dev/null +++ b/include/linux/btf.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2018 Facebook */ + +#ifndef _LINUX_BTF_H +#define _LINUX_BTF_H 1 + +#include + +struct btf; +struct btf_type; + +/* Figure out the size of a type_id. If type_id is a modifier + * (e.g. const), it will be resolved to find out the type with size. 
+ * + * For example: + * In describing "const void *", type_id is "const" and "const" + * refers to "void *". The return type will be "void *". + * + * If type_id is a simple "int", then return type will be "int". + * + * @btf: struct btf object + * @type_id: Find out the size of type_id. The type_id of the return + * type is set to *type_id. + * @ret_size: It can be NULL. If not NULL, the size of the return + * type is set to *ret_size. + * Return: The btf_type (resolved to another type with size info if needed). + * NULL is returned if type_id itself does not have size info + * (e.g. void) or it cannot be resolved to another type that + * has size info. + * *type_id and *ret_size will not be changed in the + * NULL return case. + */ +const struct btf_type *btf_type_id_size(const struct btf *btf, + u32 *type_id, + u32 *ret_size); + +#endif diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 26e9ed7cea5f..18bf266ceeda 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -105,6 +105,50 @@ * * In the first pass, it still does some verifications (e.g. * checking the name is a valid offset to the string section). + * + * Pass #2 + * ~~~~~~~ + * The main focus is to resolve a btf_type that is referring + * to another type. + * + * We have to ensure the referring type: + * 1) does exist in the BTF (i.e. in btf->types[]) + * 2) does not cause a loop: + * struct A { + * struct B b; + * }; + * + * struct B { + * struct A a; + * }; + * + * btf_type_needs_resolve() decides if a btf_type needs + * to be resolved. + * + * The needs_resolve type implements the "resolve()" ops which + * essentially does a DFS and detects backedge. + * + * During resolve (or DFS), different C types have different + * "RESOLVED" conditions. + * + * When resolving a BTF_KIND_STRUCT, we need to resolve all its + * members because a member is always referring to another + * type. A struct's member can be treated as "RESOLVED" if + * it is referring to a BTF_KIND_PTR. Otherwise, the + * following valid C struct would be rejected: + * + * struct A { + * int m; + * struct A *a; + * }; + * + * When resolving a BTF_KIND_PTR, it needs to keep resolving if + * it is referring to another BTF_KIND_PTR. 
Otherwise, we cannot + * detect a pointer loop, e.g.: + * BTF_KIND_CONST -> BTF_KIND_PTR -> BTF_KIND_CONST -> BTF_KIND_PTR + + * ^ | + * +-----------------------------------------+ + * */ #define BITS_PER_U64 (sizeof(u64) * BITS_PER_BYTE) @@ -127,12 +171,19 @@ i < btf_type_vlen(struct_type); \ i++, member++) +#define for_each_member_from(i, from, struct_type, member) \ + for (i = from, member = btf_type_member(struct_type) + from; \ + i < btf_type_vlen(struct_type); \ + i++, member++) + struct btf { union { struct btf_header *hdr; void *data; }; struct btf_type **types; + u32 *resolved_ids; + u32 *resolved_sizes; const char *strings; void *nohdr_data; u32 nr_types; @@ -140,10 +191,42 @@ struct btf { u32 data_size; }; +enum verifier_phase { + CHECK_META, + CHECK_TYPE, +}; + +struct resolve_vertex { + const struct btf_type *t; + u32 type_id; + u16 next_member; +}; + +enum visit_state { + NOT_VISITED, + VISITED, + RESOLVED, +}; + +enum resolve_mode { + RESOLVE_TBD, /* To Be Determined */ + RESOLVE_PTR, /* Resolving for Pointer */ + RESOLVE_STRUCT_OR_ARRAY, /* Resolving for struct/union + * or array + */ +}; + +#define MAX_RESOLVE_DEPTH 32 + struct btf_verifier_env { struct btf *btf; + u8 *visit_states; + struct resolve_vertex stack[MAX_RESOLVE_DEPTH]; struct bpf_verifier_log log; u32 log_type_id; + u32 top_stack; + enum verifier_phase phase; + enum resolve_mode resolve_mode; }; static const char * const btf_kind_str[NR_BTF_KINDS] = { @@ -165,6 +248,8 @@ struct btf_kind_operations { s32 (*check_meta)(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left); + int (*resolve)(struct btf_verifier_env *env, + const struct resolve_vertex *v); void (*log_details)(struct btf_verifier_env *env, const struct btf_type *t); }; @@ -172,6 +257,101 @@ struct btf_kind_operations { static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS]; static struct btf_type btf_void; +static bool btf_type_is_modifier(const struct btf_type *t) +{ + /* Some of them is not strictly a C modifier + * but they are grouped into the same bucket + * for BTF concern: + * A type (t) that refers to another + * type through t->type AND its size cannot + * be determined without following the t->type. + * + * ptr does not fall into this bucket + * because its size is always sizeof(void *). + */ + switch (BTF_INFO_KIND(t->info)) { + case BTF_KIND_TYPEDEF: + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + return true; + } + + return false; +} + +static bool btf_type_is_void(const struct btf_type *t) +{ + /* void => no type and size info. + * Hence, FWD is also treated as void. + */ + return t == &btf_void || BTF_INFO_KIND(t->info) == BTF_KIND_FWD; +} + +static bool btf_type_is_void_or_null(const struct btf_type *t) +{ + return !t || btf_type_is_void(t); +} + +/* union is only a special case of struct: + * all its offsetof(member) == 0 + */ +static bool btf_type_is_struct(const struct btf_type *t) +{ + u8 kind = BTF_INFO_KIND(t->info); + + return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; +} + +static bool btf_type_is_array(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY; +} + +static bool btf_type_is_ptr(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_PTR; +} + +static bool btf_type_is_int(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_INT; +} + +/* What types need to be resolved? + * + * btf_type_is_modifier() is an obvious one. 
+ * + * btf_type_is_struct() because its member refers to + * another type (through member->type). + + * btf_type_is_array() because its element (array->type) + * refers to another type. Array can be thought of a + * special case of struct while array just has the same + * member-type repeated by array->nelems of times. + */ +static bool btf_type_needs_resolve(const struct btf_type *t) +{ + return btf_type_is_modifier(t) || + btf_type_is_ptr(t) || + btf_type_is_struct(t) || + btf_type_is_array(t); +} + +/* t->size can be used */ +static bool btf_type_has_size(const struct btf_type *t) +{ + switch (BTF_INFO_KIND(t->info)) { + case BTF_KIND_INT: + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + case BTF_KIND_ENUM: + return true; + } + + return false; +} + static const char *btf_int_encoding_str(u8 encoding) { if (encoding == 0) @@ -234,6 +414,14 @@ static const char *btf_name_by_offset(const struct btf *btf, u32 offset) return "(invalid-name-offset)"; } +static const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) +{ + if (type_id > btf->nr_types) + return NULL; + + return btf->types[type_id]; +} + __printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log, const char *fmt, ...) { @@ -308,6 +496,15 @@ static void btf_verifier_log_member(struct btf_verifier_env *env, if (!bpf_verifier_log_needed(log)) return; + /* The CHECK_META phase already did a btf dump. + * + * If member is logged again, it must hit an error in + * parsing this member. It is useful to print out which + * struct this member belongs to. + */ + if (env->phase != CHECK_META) + btf_verifier_log_type(env, struct_type, NULL); + __btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u", btf_name_by_offset(btf, member->name), member->type, member->offset); @@ -393,15 +590,183 @@ static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t) static void btf_free(struct btf *btf) { kvfree(btf->types); + kvfree(btf->resolved_sizes); + kvfree(btf->resolved_ids); kvfree(btf->data); kfree(btf); } +static int env_resolve_init(struct btf_verifier_env *env) +{ + struct btf *btf = env->btf; + u32 nr_types = btf->nr_types; + u32 *resolved_sizes = NULL; + u32 *resolved_ids = NULL; + u8 *visit_states = NULL; + + /* +1 for btf_void */ + resolved_sizes = kvzalloc((nr_types + 1) * sizeof(*resolved_sizes), + GFP_KERNEL | __GFP_NOWARN); + if (!resolved_sizes) + goto nomem; + + resolved_ids = kvzalloc((nr_types + 1) * sizeof(*resolved_ids), + GFP_KERNEL | __GFP_NOWARN); + if (!resolved_ids) + goto nomem; + + visit_states = kvzalloc((nr_types + 1) * sizeof(*visit_states), + GFP_KERNEL | __GFP_NOWARN); + if (!visit_states) + goto nomem; + + btf->resolved_sizes = resolved_sizes; + btf->resolved_ids = resolved_ids; + env->visit_states = visit_states; + + return 0; + +nomem: + kvfree(resolved_sizes); + kvfree(resolved_ids); + kvfree(visit_states); + return -ENOMEM; +} + static void btf_verifier_env_free(struct btf_verifier_env *env) { + kvfree(env->visit_states); kfree(env); } +static bool env_type_is_resolve_sink(const struct btf_verifier_env *env, + const struct btf_type *next_type) +{ + switch (env->resolve_mode) { + case RESOLVE_TBD: + /* int, enum or void is a sink */ + return !btf_type_needs_resolve(next_type); + case RESOLVE_PTR: + /* int, enum, void, struct or array is a sink for ptr */ + return !btf_type_is_modifier(next_type) && + !btf_type_is_ptr(next_type); + case RESOLVE_STRUCT_OR_ARRAY: + /* int, enum, void or ptr is a sink for struct and array */ + return !btf_type_is_modifier(next_type) && + 
!btf_type_is_array(next_type) && + !btf_type_is_struct(next_type); + default: + BUG_ON(1); + } +} + +static bool env_type_is_resolved(const struct btf_verifier_env *env, + u32 type_id) +{ + return env->visit_states[type_id] == RESOLVED; +} + +static int env_stack_push(struct btf_verifier_env *env, + const struct btf_type *t, u32 type_id) +{ + struct resolve_vertex *v; + + if (env->top_stack == MAX_RESOLVE_DEPTH) + return -E2BIG; + + if (env->visit_states[type_id] != NOT_VISITED) + return -EEXIST; + + env->visit_states[type_id] = VISITED; + + v = &env->stack[env->top_stack++]; + v->t = t; + v->type_id = type_id; + v->next_member = 0; + + if (env->resolve_mode == RESOLVE_TBD) { + if (btf_type_is_ptr(t)) + env->resolve_mode = RESOLVE_PTR; + else if (btf_type_is_struct(t) || btf_type_is_array(t)) + env->resolve_mode = RESOLVE_STRUCT_OR_ARRAY; + } + + return 0; +} + +static void env_stack_set_next_member(struct btf_verifier_env *env, + u16 next_member) +{ + env->stack[env->top_stack - 1].next_member = next_member; +} + +static void env_stack_pop_resolved(struct btf_verifier_env *env, + u32 resolved_type_id, + u32 resolved_size) +{ + u32 type_id = env->stack[--(env->top_stack)].type_id; + struct btf *btf = env->btf; + + btf->resolved_sizes[type_id] = resolved_size; + btf->resolved_ids[type_id] = resolved_type_id; + env->visit_states[type_id] = RESOLVED; +} + +static const struct resolve_vertex *env_stack_peak(struct btf_verifier_env *env) +{ + return env->top_stack ? &env->stack[env->top_stack - 1] : NULL; +} + +/* The input param "type_id" must point to a needs_resolve type */ +static const struct btf_type *btf_type_id_resolve(const struct btf *btf, + u32 *type_id) +{ + *type_id = btf->resolved_ids[*type_id]; + return btf_type_by_id(btf, *type_id); +} + +const struct btf_type *btf_type_id_size(const struct btf *btf, + u32 *type_id, u32 *ret_size) +{ + const struct btf_type *size_type; + u32 size_type_id = *type_id; + u32 size = 0; + + size_type = btf_type_by_id(btf, size_type_id); + if (btf_type_is_void_or_null(size_type)) + return NULL; + + if (btf_type_has_size(size_type)) { + size = size_type->size; + } else if (btf_type_is_array(size_type)) { + size = btf->resolved_sizes[size_type_id]; + } else if (btf_type_is_ptr(size_type)) { + size = sizeof(void *); + } else { + if (WARN_ON_ONCE(!btf_type_is_modifier(size_type))) + return NULL; + + size = btf->resolved_sizes[size_type_id]; + size_type_id = btf->resolved_ids[size_type_id]; + size_type = btf_type_by_id(btf, size_type_id); + if (btf_type_is_void(size_type)) + return NULL; + } + + *type_id = size_type_id; + if (ret_size) + *ret_size = size; + + return size_type; +} + +static int btf_df_resolve(struct btf_verifier_env *env, + const struct resolve_vertex *v) +{ + btf_verifier_log_basic(env, v->t, "Unsupported resolve"); + return -EINVAL; +} + static s32 btf_int_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) @@ -464,6 +829,7 @@ static void btf_int_log(struct btf_verifier_env *env, static const struct btf_kind_operations int_ops = { .check_meta = btf_int_check_meta, + .resolve = btf_df_resolve, .log_details = btf_int_log, }; @@ -486,6 +852,104 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env, return 0; } +static int btf_modifier_resolve(struct btf_verifier_env *env, + const struct resolve_vertex *v) +{ + const struct btf_type *t = v->t; + const struct btf_type *next_type; + u32 next_type_id = t->type; + struct btf *btf = env->btf; + u32 next_type_size = 0; + + next_type = btf_type_by_id(btf, 
next_type_id);
+	if (!next_type) {
+		btf_verifier_log_type(env, v->t, "Invalid type_id");
+		return -EINVAL;
+	}
+
+	/* "typedef void new_void", "const void"...etc */
+	if (btf_type_is_void(next_type))
+		goto resolved;
+
+	if (!env_type_is_resolve_sink(env, next_type) &&
+	    !env_type_is_resolved(env, next_type_id))
+		return env_stack_push(env, next_type, next_type_id);
+
+	/* Figure out the resolved next_type_id with size.
+	 * They will be stored in the current modifier's
+	 * resolved_ids and resolved_sizes so that they can
+	 * save us some type-following when we use them later
+	 * (e.g. in pretty print).
+	 */
+	if (!btf_type_id_size(btf, &next_type_id, &next_type_size) &&
+	    !btf_type_is_void(btf_type_id_resolve(btf, &next_type_id))) {
+		btf_verifier_log_type(env, v->t, "Invalid type_id");
+		return -EINVAL;
+	}
+
+resolved:
+	env_stack_pop_resolved(env, next_type_id, next_type_size);
+
+	return 0;
+}
+
+static int btf_ptr_resolve(struct btf_verifier_env *env,
+			   const struct resolve_vertex *v)
+{
+	const struct btf_type *next_type;
+	const struct btf_type *t = v->t;
+	u32 next_type_id = t->type;
+	struct btf *btf = env->btf;
+	u32 next_type_size = 0;
+
+	next_type = btf_type_by_id(btf, next_type_id);
+	if (!next_type) {
+		btf_verifier_log_type(env, v->t, "Invalid type_id");
+		return -EINVAL;
+	}
+
+	/* "void *" */
+	if (btf_type_is_void(next_type))
+		goto resolved;
+
+	if (!env_type_is_resolve_sink(env, next_type) &&
+	    !env_type_is_resolved(env, next_type_id))
+		return env_stack_push(env, next_type, next_type_id);
+
+	/* If the modifier was RESOLVED during RESOLVE_STRUCT_OR_ARRAY,
+	 * the modifier may have stopped resolving when it was resolved
+	 * to a ptr (last-resolved-ptr).
+	 *
+	 * We now need to continue from the last-resolved-ptr to
+	 * ensure the last-resolved-ptr does not refer back to
+	 * the current ptr (t).
+ */ + if (btf_type_is_modifier(next_type)) { + const struct btf_type *resolved_type; + u32 resolved_type_id; + + resolved_type_id = next_type_id; + resolved_type = btf_type_id_resolve(btf, &resolved_type_id); + + if (btf_type_is_ptr(resolved_type) && + !env_type_is_resolve_sink(env, resolved_type) && + !env_type_is_resolved(env, resolved_type_id)) + return env_stack_push(env, resolved_type, + resolved_type_id); + } + + if (!btf_type_id_size(btf, &next_type_id, &next_type_size) && + !btf_type_is_void(btf_type_id_resolve(btf, &next_type_id))) { + btf_verifier_log_type(env, v->t, "Invalid type_id"); + return -EINVAL; + } + +resolved: + env_stack_pop_resolved(env, next_type_id, 0); + + return 0; +} + static void btf_ref_type_log(struct btf_verifier_env *env, const struct btf_type *t) { @@ -494,16 +958,19 @@ static void btf_ref_type_log(struct btf_verifier_env *env, static struct btf_kind_operations modifier_ops = { .check_meta = btf_ref_type_check_meta, + .resolve = btf_modifier_resolve, .log_details = btf_ref_type_log, }; static struct btf_kind_operations ptr_ops = { .check_meta = btf_ref_type_check_meta, + .resolve = btf_ptr_resolve, .log_details = btf_ref_type_log, }; static struct btf_kind_operations fwd_ops = { .check_meta = btf_ref_type_check_meta, + .resolve = btf_df_resolve, .log_details = btf_ref_type_log, }; @@ -542,6 +1009,61 @@ static s32 btf_array_check_meta(struct btf_verifier_env *env, return meta_needed; } +static int btf_array_resolve(struct btf_verifier_env *env, + const struct resolve_vertex *v) +{ + const struct btf_array *array = btf_type_array(v->t); + const struct btf_type *elem_type; + u32 elem_type_id = array->type; + struct btf *btf = env->btf; + u32 elem_size; + + elem_type = btf_type_by_id(btf, elem_type_id); + if (btf_type_is_void_or_null(elem_type)) { + btf_verifier_log_type(env, v->t, + "Invalid elem"); + return -EINVAL; + } + + if (!env_type_is_resolve_sink(env, elem_type) && + !env_type_is_resolved(env, elem_type_id)) + return env_stack_push(env, elem_type, elem_type_id); + + elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size); + if (!elem_type) { + btf_verifier_log_type(env, v->t, "Invalid elem"); + return -EINVAL; + } + + if (btf_type_is_int(elem_type)) { + int int_type_data = btf_type_int(elem_type); + u16 nr_bits = BTF_INT_BITS(int_type_data); + u16 nr_bytes = BITS_ROUNDUP_BYTES(nr_bits); + + /* Put more restriction on array of int. The int cannot + * be a bit field and it must be either u8/u16/u32/u64. 
+ */ + if (BITS_PER_BYTE_MASKED(nr_bits) || + BTF_INT_OFFSET(int_type_data) || + (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) && + nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) { + btf_verifier_log_type(env, v->t, + "Invalid array of int"); + return -EINVAL; + } + } + + if (array->nelems && elem_size > U32_MAX / array->nelems) { + btf_verifier_log_type(env, v->t, + "Array size overflows U32_MAX"); + return -EINVAL; + } + + env_stack_pop_resolved(env, elem_type_id, elem_size * array->nelems); + + return 0; +} + static void btf_array_log(struct btf_verifier_env *env, const struct btf_type *t) { @@ -553,6 +1075,7 @@ static void btf_array_log(struct btf_verifier_env *env, static struct btf_kind_operations array_ops = { .check_meta = btf_array_check_meta, + .resolve = btf_array_resolve, .log_details = btf_array_log, }; @@ -610,6 +1133,50 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, return meta_needed; } +static int btf_struct_resolve(struct btf_verifier_env *env, + const struct resolve_vertex *v) +{ + const struct btf_member *member; + u16 i; + + /* Before continue resolving the next_member, + * ensure the last member is indeed resolved to a + * type with size info. + */ + if (v->next_member) { + const struct btf_member *last_member; + u16 last_member_type_id; + + last_member = btf_type_member(v->t) + v->next_member - 1; + last_member_type_id = last_member->type; + if (WARN_ON_ONCE(!env_type_is_resolved(env, + last_member_type_id))) + return -EINVAL; + } + + for_each_member_from(i, v->next_member, v->t, member) { + u32 member_type_id = member->type; + const struct btf_type *member_type = btf_type_by_id(env->btf, + member_type_id); + + if (btf_type_is_void_or_null(member_type)) { + btf_verifier_log_member(env, v->t, member, + "Invalid member"); + return -EINVAL; + } + + if (!env_type_is_resolve_sink(env, member_type) && + !env_type_is_resolved(env, member_type_id)) { + env_stack_set_next_member(env, i + 1); + return env_stack_push(env, member_type, member_type_id); + } + } + + env_stack_pop_resolved(env, 0, 0); + + return 0; +} + static void btf_struct_log(struct btf_verifier_env *env, const struct btf_type *t) { @@ -618,6 +1185,7 @@ static void btf_struct_log(struct btf_verifier_env *env, static struct btf_kind_operations struct_ops = { .check_meta = btf_struct_check_meta, + .resolve = btf_struct_resolve, .log_details = btf_struct_log, }; @@ -671,6 +1239,7 @@ static void btf_enum_log(struct btf_verifier_env *env, static struct btf_kind_operations enum_ops = { .check_meta = btf_enum_check_meta, + .resolve = btf_df_resolve, .log_details = btf_enum_log, }; @@ -751,9 +1320,104 @@ static int btf_check_all_metas(struct btf_verifier_env *env) return 0; } +static int btf_resolve(struct btf_verifier_env *env, + const struct btf_type *t, u32 type_id) +{ + const struct resolve_vertex *v; + int err = 0; + + env->resolve_mode = RESOLVE_TBD; + env_stack_push(env, t, type_id); + while (!err && (v = env_stack_peak(env))) { + env->log_type_id = v->type_id; + err = btf_type_ops(v->t)->resolve(env, v); + } + + env->log_type_id = type_id; + if (err == -E2BIG) + btf_verifier_log_type(env, t, + "Exceeded max resolving depth:%u", + MAX_RESOLVE_DEPTH); + else if (err == -EEXIST) + btf_verifier_log_type(env, t, "Loop detected"); + + return err; +} + +static bool btf_resolve_valid(struct btf_verifier_env *env, + const struct btf_type *t, + u32 type_id) +{ + struct btf *btf = env->btf; + + if (!env_type_is_resolved(env, type_id)) + return false; + + if (btf_type_is_struct(t)) + return 
!btf->resolved_ids[type_id] && + !btf->resolved_sizes[type_id]; + + if (btf_type_is_modifier(t) || btf_type_is_ptr(t)) { + t = btf_type_id_resolve(btf, &type_id); + return t && !btf_type_is_modifier(t); + } + + if (btf_type_is_array(t)) { + const struct btf_array *array = btf_type_array(t); + const struct btf_type *elem_type; + u32 elem_type_id = array->type; + u32 elem_size; + + elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size); + return elem_type && !btf_type_is_modifier(elem_type) && + (array->nelems * elem_size == + btf->resolved_sizes[type_id]); + } + + return false; +} + +static int btf_check_all_types(struct btf_verifier_env *env) +{ + struct btf *btf = env->btf; + u32 type_id; + int err; + + err = env_resolve_init(env); + if (err) + return err; + + env->phase++; + for (type_id = 1; type_id <= btf->nr_types; type_id++) { + const struct btf_type *t = btf_type_by_id(btf, type_id); + + env->log_type_id = type_id; + if (btf_type_needs_resolve(t) && + !env_type_is_resolved(env, type_id)) { + err = btf_resolve(env, t, type_id); + if (err) + return err; + } + + if (btf_type_needs_resolve(t) && + !btf_resolve_valid(env, t, type_id)) { + btf_verifier_log_type(env, t, "Invalid resolve state"); + return -EINVAL; + } + } + + return 0; +} + static int btf_parse_type_sec(struct btf_verifier_env *env) { - return btf_check_all_metas(env); + int err; + + err = btf_check_all_metas(env); + if (err) + return err; + + return btf_check_all_types(env); } static int btf_parse_str_sec(struct btf_verifier_env *env) -- cgit v1.2.3 From b00b8daec828dd59af7d1f7a42acd6e5867f80c6 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 18 Apr 2018 15:56:00 -0700 Subject: bpf: btf: Add pretty print capability for data with BTF type info This patch adds pretty print capability for data with BTF type info. The current usage is to allow pretty print for a BPF map. The next few patches will allow a read() on a pinned map with BTF type info for its key and value. This patch uses the seq_printf() infra. 
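As an illustration (this example is not part of the patch; the struct is hypothetical), given BTF that describes a value type such as:

	struct value {
		int a;
		int b[2];
		const void *p;
	};

btf_type_seq_show() recurses through the BTF kinds and would render an instance of it roughly as {1,[2,3],<hashed-ptr>}: struct members are joined by "," ("|" for a union), array elements are wrapped in "[]", ints are printed in decimal (bitfields and oddly sized ints in hex), and pointer values are printed with the hashing "%p" format.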
Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/btf.h | 2 + kernel/bpf/btf.c | 198 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 200 insertions(+) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index f14b60368753..d8bdab0280ba 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -33,5 +33,7 @@ struct btf_type; const struct btf_type *btf_type_id_size(const struct btf *btf, u32 *type_id, u32 *ret_size); +void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, + struct seq_file *m); #endif diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 4e31249f6c61..10ee41589da2 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -256,6 +257,9 @@ struct btf_kind_operations { const struct btf_type *member_type); void (*log_details)(struct btf_verifier_env *env, const struct btf_type *t); + void (*seq_show)(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offsets, + struct seq_file *m); }; static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS]; @@ -781,6 +785,13 @@ static int btf_df_resolve(struct btf_verifier_env *env, return -EINVAL; } +static void btf_df_seq_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offsets, + struct seq_file *m) +{ + seq_printf(m, "", BTF_INFO_KIND(t->info)); +} + static int btf_int_check_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, @@ -879,11 +890,96 @@ static void btf_int_log(struct btf_verifier_env *env, btf_int_encoding_str(BTF_INT_ENCODING(int_data))); } +static void btf_int_bits_seq_show(const struct btf *btf, + const struct btf_type *t, + void *data, u8 bits_offset, + struct seq_file *m) +{ + u32 int_data = btf_type_int(t); + u16 nr_bits = BTF_INT_BITS(int_data); + u16 total_bits_offset; + u16 nr_copy_bytes; + u16 nr_copy_bits; + u8 nr_upper_bits; + union { + u64 u64_num; + u8 u8_nums[8]; + } print_num; + + total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data); + data += BITS_ROUNDDOWN_BYTES(total_bits_offset); + bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset); + nr_copy_bits = nr_bits + bits_offset; + nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits); + + print_num.u64_num = 0; + memcpy(&print_num.u64_num, data, nr_copy_bytes); + + /* Ditch the higher order bits */ + nr_upper_bits = BITS_PER_BYTE_MASKED(nr_copy_bits); + if (nr_upper_bits) { + /* We need to mask out some bits of the upper byte. 
*/ + u8 mask = (1 << nr_upper_bits) - 1; + + print_num.u8_nums[nr_copy_bytes - 1] &= mask; + } + + print_num.u64_num >>= bits_offset; + + seq_printf(m, "0x%llx", print_num.u64_num); +} + +static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct seq_file *m) +{ + u32 int_data = btf_type_int(t); + u8 encoding = BTF_INT_ENCODING(int_data); + bool sign = encoding & BTF_INT_SIGNED; + u32 nr_bits = BTF_INT_BITS(int_data); + + if (bits_offset || BTF_INT_OFFSET(int_data) || + BITS_PER_BYTE_MASKED(nr_bits)) { + btf_int_bits_seq_show(btf, t, data, bits_offset, m); + return; + } + + switch (nr_bits) { + case 64: + if (sign) + seq_printf(m, "%lld", *(s64 *)data); + else + seq_printf(m, "%llu", *(u64 *)data); + break; + case 32: + if (sign) + seq_printf(m, "%d", *(s32 *)data); + else + seq_printf(m, "%u", *(u32 *)data); + break; + case 16: + if (sign) + seq_printf(m, "%d", *(s16 *)data); + else + seq_printf(m, "%u", *(u16 *)data); + break; + case 8: + if (sign) + seq_printf(m, "%d", *(s8 *)data); + else + seq_printf(m, "%u", *(u8 *)data); + break; + default: + btf_int_bits_seq_show(btf, t, data, bits_offset, m); + } +} + static const struct btf_kind_operations int_ops = { .check_meta = btf_int_check_meta, .resolve = btf_df_resolve, .check_member = btf_int_check_member, .log_details = btf_int_log, + .seq_show = btf_int_seq_show, }; static int btf_modifier_check_member(struct btf_verifier_env *env, @@ -1054,6 +1150,24 @@ resolved: return 0; } +static void btf_modifier_seq_show(const struct btf *btf, + const struct btf_type *t, + u32 type_id, void *data, + u8 bits_offset, struct seq_file *m) +{ + t = btf_type_id_resolve(btf, &type_id); + + btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m); +} + +static void btf_ptr_seq_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct seq_file *m) +{ + /* It is a hashed value */ + seq_printf(m, "%p", *(void **)data); +} + static void btf_ref_type_log(struct btf_verifier_env *env, const struct btf_type *t) { @@ -1065,6 +1179,7 @@ static struct btf_kind_operations modifier_ops = { .resolve = btf_modifier_resolve, .check_member = btf_modifier_check_member, .log_details = btf_ref_type_log, + .seq_show = btf_modifier_seq_show, }; static struct btf_kind_operations ptr_ops = { @@ -1072,6 +1187,7 @@ static struct btf_kind_operations ptr_ops = { .resolve = btf_ptr_resolve, .check_member = btf_ptr_check_member, .log_details = btf_ref_type_log, + .seq_show = btf_ptr_seq_show, }; static struct btf_kind_operations fwd_ops = { @@ -1079,6 +1195,7 @@ static struct btf_kind_operations fwd_ops = { .resolve = btf_df_resolve, .check_member = btf_df_check_member, .log_details = btf_ref_type_log, + .seq_show = btf_df_seq_show, }; static int btf_array_check_member(struct btf_verifier_env *env, @@ -1209,11 +1326,36 @@ static void btf_array_log(struct btf_verifier_env *env, array->type, array->index_type, array->nelems); } +static void btf_array_seq_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct seq_file *m) +{ + const struct btf_array *array = btf_type_array(t); + const struct btf_kind_operations *elem_ops; + const struct btf_type *elem_type; + u32 i, elem_size, elem_type_id; + + elem_type_id = array->type; + elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size); + elem_ops = btf_type_ops(elem_type); + seq_puts(m, "["); + for (i = 0; i < array->nelems; i++) { + if (i) + seq_puts(m, ","); + 
+ elem_ops->seq_show(btf, elem_type, elem_type_id, data, + bits_offset, m); + data += elem_size; + } + seq_puts(m, "]"); +} + static struct btf_kind_operations array_ops = { .check_meta = btf_array_check_meta, .resolve = btf_array_resolve, .check_member = btf_array_check_member, .log_details = btf_array_log, + .seq_show = btf_array_seq_show, }; static int btf_struct_check_member(struct btf_verifier_env *env, @@ -1361,11 +1503,39 @@ static void btf_struct_log(struct btf_verifier_env *env, btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); } +static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct seq_file *m) +{ + const char *seq = BTF_INFO_KIND(t->info) == BTF_KIND_UNION ? "|" : ","; + const struct btf_member *member; + u32 i; + + seq_puts(m, "{"); + for_each_member(i, t, member) { + const struct btf_type *member_type = btf_type_by_id(btf, + member->type); + u32 member_offset = member->offset; + u32 bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset); + u8 bits8_offset = BITS_PER_BYTE_MASKED(member_offset); + const struct btf_kind_operations *ops; + + if (i) + seq_puts(m, seq); + + ops = btf_type_ops(member_type); + ops->seq_show(btf, member_type, member->type, + data + bytes_offset, bits8_offset, m); + } + seq_puts(m, "}"); +} + static struct btf_kind_operations struct_ops = { .check_meta = btf_struct_check_meta, .resolve = btf_struct_resolve, .check_member = btf_struct_check_member, .log_details = btf_struct_log, + .seq_show = btf_struct_seq_show, }; static int btf_enum_check_member(struct btf_verifier_env *env, @@ -1441,11 +1611,31 @@ static void btf_enum_log(struct btf_verifier_env *env, btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); } +static void btf_enum_seq_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct seq_file *m) +{ + const struct btf_enum *enums = btf_type_enum(t); + u32 i, nr_enums = btf_type_vlen(t); + int v = *(int *)data; + + for (i = 0; i < nr_enums; i++) { + if (v == enums[i].val) { + seq_printf(m, "%s", + btf_name_by_offset(btf, enums[i].name)); + return; + } + } + + seq_printf(m, "%d", v); +} + static struct btf_kind_operations enum_ops = { .check_meta = btf_enum_check_meta, .resolve = btf_df_resolve, .check_member = btf_enum_check_member, .log_details = btf_enum_log, + .seq_show = btf_enum_seq_show, }; static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = { @@ -1782,3 +1972,11 @@ errout: btf_free(btf); return ERR_PTR(err); } + +void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, + struct seq_file *m) +{ + const struct btf_type *t = btf_type_by_id(btf, type_id); + + btf_type_ops(t)->seq_show(btf, t, type_id, obj, 0, m); +} -- cgit v1.2.3 From f56a653c1fd13a197076dec4461c656fd2adec73 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 18 Apr 2018 15:56:01 -0700 Subject: bpf: btf: Add BPF_BTF_LOAD command This patch adds a BPF_BTF_LOAD command which 1) loads and verifies the BTF (implemented in earlier patches) 2) returns a BTF fd to userspace. In the next patch, the BTF fd can be specified during BPF_MAP_CREATE. It currently limits to CAP_SYS_ADMIN. 
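A minimal userspace sketch of the new command (illustrative only, not part of the patch; btf_data/btf_size are assumed to hold a valid BTF blob, and ptr_to_u64() is the usual cast-to-__u64 helper):

	union bpf_attr attr = {};
	char log[4096];
	int btf_fd;

	attr.btf = ptr_to_u64(btf_data);
	attr.btf_size = btf_size;
	attr.btf_log_buf = ptr_to_u64(log);
	attr.btf_log_size = sizeof(log);
	attr.btf_log_level = 1;

	btf_fd = syscall(__NR_bpf, BPF_BTF_LOAD, &attr, sizeof(attr));

On success, btf_fd refers to the loaded and verified BTF; on failure, the verifier's messages (if a log buffer was supplied) can be found in log.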
Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/btf.h | 4 +++ include/uapi/linux/bpf.h | 9 +++++++ kernel/bpf/btf.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 17 ++++++++++++ 4 files changed, 97 insertions(+) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index d8bdab0280ba..a7c7072535ea 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -8,7 +8,11 @@ struct btf; struct btf_type; +union bpf_attr; +void btf_put(struct btf *btf); +int btf_new_fd(const union bpf_attr *attr); +struct btf *btf_get_by_fd(int fd); /* Figure out the size of a type_id. If type_id is a modifier * (e.g. const), it will be resolved to find out the type with size. * diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9a2d1a04eb24..795bcd577750 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -95,6 +95,7 @@ enum bpf_cmd { BPF_OBJ_GET_INFO_BY_FD, BPF_PROG_QUERY, BPF_RAW_TRACEPOINT_OPEN, + BPF_BTF_LOAD, }; enum bpf_map_type { @@ -363,6 +364,14 @@ union bpf_attr { __u64 name; __u32 prog_fd; } raw_tracepoint; + + struct { /* anonymous struct for BPF_BTF_LOAD */ + __aligned_u64 btf; + __aligned_u64 btf_log_buf; + __u32 btf_size; + __u32 btf_log_size; + __u32 btf_log_level; + }; } __attribute__((aligned(8))); /* BPF helper function descriptions: diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 10ee41589da2..2322340694cf 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -7,6 +7,8 @@ #include #include #include +#include +#include #include #include #include @@ -190,6 +192,7 @@ struct btf { u32 nr_types; u32 types_size; u32 data_size; + refcount_t refcnt; }; enum verifier_phase { @@ -604,6 +607,17 @@ static void btf_free(struct btf *btf) kfree(btf); } +static void btf_get(struct btf *btf) +{ + refcount_inc(&btf->refcnt); +} + +void btf_put(struct btf *btf) +{ + if (btf && refcount_dec_and_test(&btf->refcnt)) + btf_free(btf); +} + static int env_resolve_init(struct btf_verifier_env *env) { struct btf *btf = env->btf; @@ -1963,6 +1977,7 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size, if (!err) { btf_verifier_env_free(env); + btf_get(btf); return btf; } @@ -1980,3 +1995,55 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, btf_type_ops(t)->seq_show(btf, t, type_id, obj, 0, m); } + +static int btf_release(struct inode *inode, struct file *filp) +{ + btf_put(filp->private_data); + return 0; +} + +static const struct file_operations btf_fops = { + .release = btf_release, +}; + +int btf_new_fd(const union bpf_attr *attr) +{ + struct btf *btf; + int fd; + + btf = btf_parse(u64_to_user_ptr(attr->btf), + attr->btf_size, attr->btf_log_level, + u64_to_user_ptr(attr->btf_log_buf), + attr->btf_log_size); + if (IS_ERR(btf)) + return PTR_ERR(btf); + + fd = anon_inode_getfd("btf", &btf_fops, btf, + O_RDONLY | O_CLOEXEC); + if (fd < 0) + btf_put(btf); + + return fd; +} + +struct btf *btf_get_by_fd(int fd) +{ + struct btf *btf; + struct fd f; + + f = fdget(fd); + + if (!f.file) + return ERR_PTR(-EBADF); + + if (f.file->f_op != &btf_fops) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + btf = f.file->private_data; + btf_get(btf); + fdput(f); + + return btf; +} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4ca46df19c9a..cd8ebadc66eb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -11,6 +11,7 @@ */ #include #include +#include #include #include #include @@ -2023,6 +2024,19 @@ static 
int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
 	return err;
 }
 
+#define BPF_BTF_LOAD_LAST_FIELD btf_log_level
+
+static int bpf_btf_load(const union bpf_attr *attr)
+{
+	if (CHECK_ATTR(BPF_BTF_LOAD))
+		return -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	return btf_new_fd(attr);
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr attr = {};
@@ -2103,6 +2117,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_RAW_TRACEPOINT_OPEN:
 		err = bpf_raw_tracepoint_open(&attr);
 		break;
+	case BPF_BTF_LOAD:
+		err = bpf_btf_load(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
--
cgit v1.2.3


From 60197cfb6e11ffc03aa0ed23765b2f7e70b2e2d4 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Wed, 18 Apr 2018 15:56:02 -0700
Subject: bpf: btf: Add BPF_OBJ_GET_INFO_BY_FD support to BTF fd

This patch adds BPF_OBJ_GET_INFO_BY_FD support to BTF fd. The original BTF data, which was used to create the BTF fd during the earlier BPF_BTF_LOAD call, will be returned. The userspace is expected to allocate a buffer for info.info and to set its size in info.info_len before calling BPF_OBJ_GET_INFO_BY_FD. The original BTF data is copied to the userspace buffer (info.info); only up to the user-specified info.info_len bytes will be copied. The original BTF data size is set in info.info_len. The userspace needs to check whether it is bigger than its allocated buffer size. If it is, the userspace should realloc with the kernel-returned info.info_len and call BPF_OBJ_GET_INFO_BY_FD again.

Signed-off-by: Martin KaFai Lau
Acked-by: Alexei Starovoitov
Signed-off-by: Daniel Borkmann
---
 include/linux/btf.h  |  5 +++++
 kernel/bpf/btf.c     | 17 ++++++++++++++++-
 kernel/bpf/syscall.c |  2 ++
 3 files changed, 23 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/btf.h b/include/linux/btf.h
index a7c7072535ea..a966dc6d61ee 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -10,9 +10,14 @@ struct btf;
 struct btf_type;
 union bpf_attr;
+extern const struct file_operations btf_fops;
+
 void btf_put(struct btf *btf);
 int btf_new_fd(const union bpf_attr *attr);
 struct btf *btf_get_by_fd(int fd);
+int btf_get_info_by_fd(const struct btf *btf,
+		       const union bpf_attr *attr,
+		       union bpf_attr __user *uattr);
 /* Figure out the size of a type_id. If type_id is a modifier
  * (e.g. const), it will be resolved to find out the type with size.
 *
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 2322340694cf..eb56ac760547 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -2002,7 +2002,7 @@ static int btf_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-static const struct file_operations btf_fops = {
+const struct file_operations btf_fops = {
 	.release = btf_release,
 };
 
@@ -2047,3 +2047,18 @@ struct btf *btf_get_by_fd(int fd)
 
 	return btf;
 }
+
+int btf_get_info_by_fd(const struct btf *btf,
+		       const union bpf_attr *attr,
+		       union bpf_attr __user *uattr)
+{
+	void __user *udata = u64_to_user_ptr(attr->info.info);
+	u32 copy_len = min_t(u32, btf->data_size,
+			     attr->info.info_len);
+
+	if (copy_to_user(udata, btf->data, copy_len) ||
+	    put_user(btf->data_size, &uattr->info.info_len))
+		return -EFAULT;
+
+	return 0;
+}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cd8ebadc66eb..0a4924a0a8da 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2017,6 +2017,8 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
 	else if (f.file->f_op == &bpf_map_fops)
 		err = bpf_map_get_info_by_fd(f.file->private_data,
 					     attr, uattr);
+	else if (f.file->f_op == &btf_fops)
+		err = btf_get_info_by_fd(f.file->private_data, attr, uattr);
 	else
 		err = -EINVAL;
--
cgit v1.2.3


From a26ca7c982cb576749cbdd01e8ecde4bf010d60a Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Wed, 18 Apr 2018 15:56:03 -0700
Subject: bpf: btf: Add pretty print support to the basic arraymap

This patch adds pretty print support to the basic arraymap. Support for other bpf maps can be added later.

This patch adds new attrs to the BPF_MAP_CREATE command to allow specifying the btf_fd, btf_key_id and btf_value_id. BPF_MAP_CREATE can then associate the BTF with the map if the map being created supports BTF.

A BTF-supported map needs to implement two new map ops, map_seq_show_elem() and map_check_btf(). This patch implements these new map ops for the basic arraymap.

It also adds file_operations, bpffs_map_fops, to the pinned map such that the pinned map can be opened and read. After that, the user has an intuitive way to do "cat bpffs/pathto/a-pinned-map" instead of getting an error.

bpffs_map_fops should not be extended further to support other operations. Other operations (e.g. write, key lookup...) should be realized by userspace tools (e.g. bpftool) through BPF_OBJ_GET_INFO_BY_FD, the map's lookup/update interface, etc. Follow-up patches will allow the userspace to obtain the BTF from a map-fd.

Here is a sample output when reading a pinned arraymap with the following map's value:

	struct map_value {
		int count_a;
		int count_b;
	};

cat /sys/fs/bpf/pinned_array_map:

0: {1,2}
1: {3,4}
2: {5,6}
...
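A userspace sketch of how such a map could be created (illustrative only, not part of the patch; btf_fd is assumed to come from an earlier BPF_BTF_LOAD, and key_id/value_id are the BTF type_ids of "int" and "struct map_value" within that blob):

	union bpf_attr attr = {};
	int map_fd;

	attr.map_type = BPF_MAP_TYPE_ARRAY;
	attr.key_size = sizeof(int);
	attr.value_size = sizeof(struct map_value);
	attr.max_entries = 3;
	attr.btf_fd = btf_fd;
	attr.btf_key_id = key_id;
	attr.btf_value_id = value_id;

	map_fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));

map_create() then takes a reference on the BTF and stores it in the map, and a subsequent pin of map_fd under bpffs picks up the readable bpffs_map_fops.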
Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 20 +++++- include/uapi/linux/bpf.h | 3 + kernel/bpf/arraymap.c | 50 +++++++++++++++ kernel/bpf/inode.c | 156 ++++++++++++++++++++++++++++++++++++++++++++++- kernel/bpf/syscall.c | 32 +++++++++- 5 files changed, 254 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 95a7abd0ee92..ee5275e7d4df 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -22,6 +22,8 @@ struct perf_event; struct bpf_prog; struct bpf_map; struct sock; +struct seq_file; +struct btf; /* map is generic key/value storage optionally accesible by eBPF programs */ struct bpf_map_ops { @@ -43,10 +45,14 @@ struct bpf_map_ops { void (*map_fd_put_ptr)(void *ptr); u32 (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); u32 (*map_fd_sys_lookup_elem)(void *ptr); + void (*map_seq_show_elem)(struct bpf_map *map, void *key, + struct seq_file *m); + int (*map_check_btf)(const struct bpf_map *map, const struct btf *btf, + u32 key_type_id, u32 value_type_id); }; struct bpf_map { - /* 1st cacheline with read-mostly members of which some + /* The first two cachelines with read-mostly members of which some * are also accessed in fast-path (e.g. ops, max_entries). */ const struct bpf_map_ops *ops ____cacheline_aligned; @@ -62,10 +68,13 @@ struct bpf_map { u32 pages; u32 id; int numa_node; + u32 btf_key_id; + u32 btf_value_id; + struct btf *btf; bool unpriv_array; - /* 7 bytes hole */ + /* 55 bytes hole */ - /* 2nd cacheline with misc members to avoid false sharing + /* The 3rd and 4th cacheline with misc members to avoid false sharing * particularly with refcounting. */ struct user_struct *user ____cacheline_aligned; @@ -100,6 +109,11 @@ static inline struct bpf_offloaded_map *map_to_offmap(struct bpf_map *map) return container_of(map, struct bpf_offloaded_map, map); } +static inline bool bpf_map_support_seq_show(const struct bpf_map *map) +{ + return map->ops->map_seq_show_elem && map->ops->map_check_btf; +} + extern const struct bpf_map_ops bpf_map_offload_ops; /* function argument constraints */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 795bcd577750..c8383a289f7b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -280,6 +280,9 @@ union bpf_attr { */ char map_name[BPF_OBJ_NAME_LEN]; __u32 map_ifindex; /* ifindex of netdev to create on */ + __u32 btf_fd; /* fd pointing to a BTF type data */ + __u32 btf_key_id; /* BTF type_id of the key */ + __u32 btf_value_id; /* BTF type_id of the value */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 14750e7c5ee4..02a189339381 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -11,11 +11,13 @@ * General Public License for more details. 
*/ #include +#include #include #include #include #include #include +#include #include "map_in_map.h" @@ -336,6 +338,52 @@ static void array_map_free(struct bpf_map *map) bpf_map_area_free(array); } +static void array_map_seq_show_elem(struct bpf_map *map, void *key, + struct seq_file *m) +{ + void *value; + + rcu_read_lock(); + + value = array_map_lookup_elem(map, key); + if (!value) { + rcu_read_unlock(); + return; + } + + seq_printf(m, "%u: ", *(u32 *)key); + btf_type_seq_show(map->btf, map->btf_value_id, value, m); + seq_puts(m, "\n"); + + rcu_read_unlock(); +} + +static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf, + u32 btf_key_id, u32 btf_value_id) +{ + const struct btf_type *key_type, *value_type; + u32 key_size, value_size; + u32 int_data; + + key_type = btf_type_id_size(btf, &btf_key_id, &key_size); + if (!key_type || BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) + return -EINVAL; + + int_data = *(u32 *)(key_type + 1); + /* bpf array can only take a u32 key. This check makes + * sure that the btf matches the attr used during map_create. + */ + if (BTF_INT_BITS(int_data) != 32 || key_size != 4 || + BTF_INT_OFFSET(int_data)) + return -EINVAL; + + value_type = btf_type_id_size(btf, &btf_value_id, &value_size); + if (!value_type || value_size > map->value_size) + return -EINVAL; + + return 0; +} + const struct bpf_map_ops array_map_ops = { .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, @@ -345,6 +393,8 @@ const struct bpf_map_ops array_map_ops = { .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, .map_gen_lookup = array_map_gen_lookup, + .map_seq_show_elem = array_map_seq_show_elem, + .map_check_btf = array_map_check_btf, }; const struct bpf_map_ops percpu_array_map_ops = { diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index bf6da59ae0d0..a41343009ccc 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -150,8 +150,154 @@ static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) return 0; } +struct map_iter { + void *key; + bool done; +}; + +static struct map_iter *map_iter(struct seq_file *m) +{ + return m->private; +} + +static struct bpf_map *seq_file_to_map(struct seq_file *m) +{ + return file_inode(m->file)->i_private; +} + +static void map_iter_free(struct map_iter *iter) +{ + if (iter) { + kfree(iter->key); + kfree(iter); + } +} + +static struct map_iter *map_iter_alloc(struct bpf_map *map) +{ + struct map_iter *iter; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL | __GFP_NOWARN); + if (!iter) + goto error; + + iter->key = kzalloc(map->key_size, GFP_KERNEL | __GFP_NOWARN); + if (!iter->key) + goto error; + + return iter; + +error: + map_iter_free(iter); + return NULL; +} + +static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct bpf_map *map = seq_file_to_map(m); + void *key = map_iter(m)->key; + + if (map_iter(m)->done) + return NULL; + + if (unlikely(v == SEQ_START_TOKEN)) + goto done; + + if (map->ops->map_get_next_key(map, key, key)) { + map_iter(m)->done = true; + return NULL; + } + +done: + ++(*pos); + return key; +} + +static void *map_seq_start(struct seq_file *m, loff_t *pos) +{ + if (map_iter(m)->done) + return NULL; + + return *pos ? 
map_iter(m)->key : SEQ_START_TOKEN; +} + +static void map_seq_stop(struct seq_file *m, void *v) +{ +} + +static int map_seq_show(struct seq_file *m, void *v) +{ + struct bpf_map *map = seq_file_to_map(m); + void *key = map_iter(m)->key; + + if (unlikely(v == SEQ_START_TOKEN)) { + seq_puts(m, "# WARNING!! The output is for debug purpose only\n"); + seq_puts(m, "# WARNING!! The output format will change\n"); + } else { + map->ops->map_seq_show_elem(map, key, m); + } + + return 0; +} + +static const struct seq_operations bpffs_map_seq_ops = { + .start = map_seq_start, + .next = map_seq_next, + .show = map_seq_show, + .stop = map_seq_stop, +}; + +static int bpffs_map_open(struct inode *inode, struct file *file) +{ + struct bpf_map *map = inode->i_private; + struct map_iter *iter; + struct seq_file *m; + int err; + + iter = map_iter_alloc(map); + if (!iter) + return -ENOMEM; + + err = seq_open(file, &bpffs_map_seq_ops); + if (err) { + map_iter_free(iter); + return err; + } + + m = file->private_data; + m->private = iter; + + return 0; +} + +static int bpffs_map_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + + map_iter_free(map_iter(m)); + + return seq_release(inode, file); +} + +/* bpffs_map_fops should only implement the basic + * read operation for a BPF map. The purpose is to + * provide a simple user intuitive way to do + * "cat bpffs/pathto/a-pinned-map". + * + * Other operations (e.g. write, lookup...) should be realized by + * the userspace tools (e.g. bpftool) through the + * BPF_OBJ_GET_INFO_BY_FD and the map's lookup/update + * interface. + */ +static const struct file_operations bpffs_map_fops = { + .open = bpffs_map_open, + .read = seq_read, + .release = bpffs_map_release, +}; + static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw, - const struct inode_operations *iops) + const struct inode_operations *iops, + const struct file_operations *fops) { struct inode *dir = dentry->d_parent->d_inode; struct inode *inode = bpf_get_inode(dir->i_sb, dir, mode); @@ -159,6 +305,7 @@ static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw, return PTR_ERR(inode); inode->i_op = iops; + inode->i_fop = fops; inode->i_private = raw; bpf_dentry_finalize(dentry, inode, dir); @@ -167,12 +314,15 @@ static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw, static int bpf_mkprog(struct dentry *dentry, umode_t mode, void *arg) { - return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops); + return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops, NULL); } static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) { - return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops); + struct bpf_map *map = arg; + + return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops, + map->btf ? 
&bpffs_map_fops : NULL); } static struct dentry * diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0a4924a0a8da..fe23dc5a3ec4 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -27,6 +27,7 @@ #include #include #include +#include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ @@ -251,6 +252,7 @@ static void bpf_map_free_deferred(struct work_struct *work) bpf_map_uncharge_memlock(map); security_bpf_map_free(map); + btf_put(map->btf); /* implementation dependent freeing */ map->ops->map_free(map); } @@ -416,7 +418,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src) return 0; } -#define BPF_MAP_CREATE_LAST_FIELD map_ifindex +#define BPF_MAP_CREATE_LAST_FIELD btf_value_id /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -450,6 +452,33 @@ static int map_create(union bpf_attr *attr) atomic_set(&map->refcnt, 1); atomic_set(&map->usercnt, 1); + if (bpf_map_support_seq_show(map) && + (attr->btf_key_id || attr->btf_value_id)) { + struct btf *btf; + + if (!attr->btf_key_id || !attr->btf_value_id) { + err = -EINVAL; + goto free_map_nouncharge; + } + + btf = btf_get_by_fd(attr->btf_fd); + if (IS_ERR(btf)) { + err = PTR_ERR(btf); + goto free_map_nouncharge; + } + + err = map->ops->map_check_btf(map, btf, attr->btf_key_id, + attr->btf_value_id); + if (err) { + btf_put(btf); + goto free_map_nouncharge; + } + + map->btf = btf; + map->btf_key_id = attr->btf_key_id; + map->btf_value_id = attr->btf_value_id; + } + err = security_bpf_map_alloc(map); if (err) goto free_map_nouncharge; @@ -482,6 +511,7 @@ free_map: free_map_sec: security_bpf_map_free(map); free_map_nouncharge: + btf_put(map->btf); map->ops->map_free(map); return err; } -- cgit v1.2.3 From 9e4d60938a2b2aae3c2c213c923d9eb8d0a87ba2 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 19 Apr 2018 01:02:50 +0200 Subject: net: phy: mdio-gpio: Remove reset function The platform data can contain a function to call to reset the bit banging interface. It is not used, so remove it. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/mdio-gpio.c | 1 - include/linux/platform_data/mdio-gpio.h | 2 -- 2 files changed, 3 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c index 369f5f35d6fd..570b87b82abc 100644 --- a/drivers/net/phy/mdio-gpio.c +++ b/drivers/net/phy/mdio-gpio.c @@ -141,7 +141,6 @@ static struct mii_bus *mdio_gpio_bus_init(struct device *dev, goto out; bitbang->ctrl.ops = &mdio_gpio_ops; - bitbang->ctrl.reset = pdata->reset; mdc = pdata->mdc; bitbang->mdc = gpio_to_desc(mdc); if (pdata->mdc_active_low) diff --git a/include/linux/platform_data/mdio-gpio.h b/include/linux/platform_data/mdio-gpio.h index 11f00cdabe3d..6e8f01a570f2 100644 --- a/include/linux/platform_data/mdio-gpio.h +++ b/include/linux/platform_data/mdio-gpio.h @@ -26,8 +26,6 @@ struct mdio_gpio_platform_data { u32 phy_mask; u32 phy_ignore_ta_mask; int irqs[PHY_MAX_ADDR]; - /* reset callback */ - int (*reset)(struct mii_bus *bus); }; #endif /* __LINUX_MDIO_GPIO_H */ -- cgit v1.2.3 From a3283e2576736463e07ddc684efb2e587a3bbda2 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 19 Apr 2018 01:02:51 +0200 Subject: net: phy: mdio-bitbang: Remove reset support The mdio-gpio driver was the only user of the interface reset option. Since it no longer uses it, remove it from the bit banging code. Signed-off-by: Andrew Lunn Signed-off-by: David S. 
Miller --- drivers/net/phy/mdio-bitbang.c | 9 --------- include/linux/mdio-bitbang.h | 2 -- 2 files changed, 11 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/mdio-bitbang.c b/drivers/net/phy/mdio-bitbang.c index 403b085f0a89..15352f987bdf 100644 --- a/drivers/net/phy/mdio-bitbang.c +++ b/drivers/net/phy/mdio-bitbang.c @@ -205,14 +205,6 @@ static int mdiobb_write(struct mii_bus *bus, int phy, int reg, u16 val) return 0; } -static int mdiobb_reset(struct mii_bus *bus) -{ - struct mdiobb_ctrl *ctrl = bus->priv; - if (ctrl->reset) - ctrl->reset(bus); - return 0; -} - struct mii_bus *alloc_mdio_bitbang(struct mdiobb_ctrl *ctrl) { struct mii_bus *bus; @@ -225,7 +217,6 @@ struct mii_bus *alloc_mdio_bitbang(struct mdiobb_ctrl *ctrl) bus->read = mdiobb_read; bus->write = mdiobb_write; - bus->reset = mdiobb_reset; bus->priv = ctrl; return bus; diff --git a/include/linux/mdio-bitbang.h b/include/linux/mdio-bitbang.h index a8ac9cfa014c..5d71e8a8500f 100644 --- a/include/linux/mdio-bitbang.h +++ b/include/linux/mdio-bitbang.h @@ -33,8 +33,6 @@ struct mdiobb_ops { struct mdiobb_ctrl { const struct mdiobb_ops *ops; - /* reset callback */ - int (*reset)(struct mii_bus *bus); }; /* The returned bus is not yet registered with the phy layer. */ -- cgit v1.2.3 From c1b3eb04682f80af74572aa25601dbb555c6bc6e Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 19 Apr 2018 01:02:52 +0200 Subject: net: phy: mdio-gpio: remove support for ignoring turn around This is not needed any more by devices using platform data, so remove it. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/mdio-gpio.c | 1 - include/linux/platform_data/mdio-gpio.h | 1 - 2 files changed, 2 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c index 570b87b82abc..28ad9ca7b9e7 100644 --- a/drivers/net/phy/mdio-gpio.c +++ b/drivers/net/phy/mdio-gpio.c @@ -163,7 +163,6 @@ static struct mii_bus *mdio_gpio_bus_init(struct device *dev, new_bus->name = "GPIO Bitbanged MDIO"; new_bus->phy_mask = pdata->phy_mask; - new_bus->phy_ignore_ta_mask = pdata->phy_ignore_ta_mask; memcpy(new_bus->irq, pdata->irqs, sizeof(new_bus->irq)); new_bus->parent = dev; diff --git a/include/linux/platform_data/mdio-gpio.h b/include/linux/platform_data/mdio-gpio.h index 6e8f01a570f2..b8f914a30126 100644 --- a/include/linux/platform_data/mdio-gpio.h +++ b/include/linux/platform_data/mdio-gpio.h @@ -24,7 +24,6 @@ struct mdio_gpio_platform_data { bool mdo_active_low; u32 phy_mask; - u32 phy_ignore_ta_mask; int irqs[PHY_MAX_ADDR]; }; -- cgit v1.2.3 From 185a16b60a239f9db3fe8b8ce931a2fd61330853 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 19 Apr 2018 01:02:53 +0200 Subject: net: phy: mdio-gpio: remove support for phy mask This is not needed any more by devices using platform data, so remove it. Signed-off-by: Andrew Lunn Signed-off-by: David S. 
Miller --- drivers/net/phy/mdio-gpio.c | 4 ---- include/linux/platform_data/mdio-gpio.h | 1 - 2 files changed, 5 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c index 28ad9ca7b9e7..676ba0dd04be 100644 --- a/drivers/net/phy/mdio-gpio.c +++ b/drivers/net/phy/mdio-gpio.c @@ -162,13 +162,9 @@ static struct mii_bus *mdio_gpio_bus_init(struct device *dev, new_bus->name = "GPIO Bitbanged MDIO"; - new_bus->phy_mask = pdata->phy_mask; memcpy(new_bus->irq, pdata->irqs, sizeof(new_bus->irq)); new_bus->parent = dev; - if (new_bus->phy_mask == ~0) - goto out_free_bus; - for (i = 0; i < PHY_MAX_ADDR; i++) if (!new_bus->irq[i]) new_bus->irq[i] = PHY_POLL; diff --git a/include/linux/platform_data/mdio-gpio.h b/include/linux/platform_data/mdio-gpio.h index b8f914a30126..7d55dfef56dc 100644 --- a/include/linux/platform_data/mdio-gpio.h +++ b/include/linux/platform_data/mdio-gpio.h @@ -23,7 +23,6 @@ struct mdio_gpio_platform_data { bool mdio_active_low; bool mdo_active_low; - u32 phy_mask; int irqs[PHY_MAX_ADDR]; }; -- cgit v1.2.3 From 68abb4f25d7eaf4d5f40108aa748621ddab68209 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 19 Apr 2018 01:02:54 +0200 Subject: net: phy: mdio-gpio: Remove support for IRQs in platform data No current devices use IRQs in platform data, so remove support for it. The MDIO core will also initialise the new bus such that all addresses are polled, so remove the unneeded re-initialisation. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/mdio-gpio.c | 7 ------- include/linux/platform_data/mdio-gpio.h | 2 -- 2 files changed, 9 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c index 676ba0dd04be..0f8c748c8edd 100644 --- a/drivers/net/phy/mdio-gpio.c +++ b/drivers/net/phy/mdio-gpio.c @@ -130,7 +130,6 @@ static struct mii_bus *mdio_gpio_bus_init(struct device *dev, { struct mii_bus *new_bus; struct mdio_gpio_info *bitbang; - int i; int mdc, mdio, mdo; unsigned long mdc_flags = GPIOF_OUT_INIT_LOW; unsigned long mdio_flags = GPIOF_DIR_IN; @@ -161,14 +160,8 @@ static struct mii_bus *mdio_gpio_bus_init(struct device *dev, goto out; new_bus->name = "GPIO Bitbanged MDIO"; - - memcpy(new_bus->irq, pdata->irqs, sizeof(new_bus->irq)); new_bus->parent = dev; - for (i = 0; i < PHY_MAX_ADDR; i++) - if (!new_bus->irq[i]) - new_bus->irq[i] = PHY_POLL; - if (bus_id != -1) snprintf(new_bus->id, MII_BUS_ID_SIZE, "gpio-%x", bus_id); else diff --git a/include/linux/platform_data/mdio-gpio.h b/include/linux/platform_data/mdio-gpio.h index 7d55dfef56dc..af3be0c4ff9b 100644 --- a/include/linux/platform_data/mdio-gpio.h +++ b/include/linux/platform_data/mdio-gpio.h @@ -22,8 +22,6 @@ struct mdio_gpio_platform_data { bool mdc_active_low; bool mdio_active_low; bool mdo_active_low; - - int irqs[PHY_MAX_ADDR]; }; #endif /* __LINUX_MDIO_GPIO_H */ -- cgit v1.2.3 From c82fc4814a93f36c9b536b5af8dd617e4ee1380f Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 19 Apr 2018 01:02:55 +0200 Subject: net: phy: mdio-gpio: Swap to using gpio descriptors This simplifies the code, removing the need to handle active low flags, etc. Signed-off-by: Andrew Lunn Signed-off-by: David S. 
Miller --- drivers/net/phy/mdio-gpio.c | 73 ++++++++------------------------- include/linux/platform_data/mdio-gpio.h | 10 ++--- 2 files changed, 20 insertions(+), 63 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c index 0f8c748c8edd..b999f32374e2 100644 --- a/drivers/net/phy/mdio-gpio.c +++ b/drivers/net/phy/mdio-gpio.c @@ -35,35 +35,25 @@ struct mdio_gpio_info { struct gpio_desc *mdc, *mdio, *mdo; }; -static void *mdio_gpio_of_get_data(struct platform_device *pdev) +static void *mdio_gpio_of_get_data(struct device *dev) { - struct device_node *np = pdev->dev.of_node; struct mdio_gpio_platform_data *pdata; - enum of_gpio_flags flags; - int ret; - pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); + pdata = devm_kzalloc(dev, sizeof(*pdata), GFP_KERNEL); if (!pdata) return NULL; - ret = of_get_gpio_flags(np, 0, &flags); - if (ret < 0) - return NULL; - - pdata->mdc = ret; - pdata->mdc_active_low = flags & OF_GPIO_ACTIVE_LOW; + pdata->mdc = devm_gpiod_get_index(dev, NULL, 0, GPIOD_OUT_LOW); + if (IS_ERR(pdata->mdc)) + return ERR_CAST(pdata->mdc); - ret = of_get_gpio_flags(np, 1, &flags); - if (ret < 0) - return NULL; - pdata->mdio = ret; - pdata->mdio_active_low = flags & OF_GPIO_ACTIVE_LOW; + pdata->mdio = devm_gpiod_get_index(dev, NULL, 1, GPIOD_IN); + if (IS_ERR(pdata->mdio)) + return ERR_CAST(pdata->mdio); - ret = of_get_gpio_flags(np, 2, &flags); - if (ret > 0) { - pdata->mdo = ret; - pdata->mdo_active_low = flags & OF_GPIO_ACTIVE_LOW; - } + pdata->mdo = devm_gpiod_get_index_optional(dev, NULL, 2, GPIOD_OUT_LOW); + if (IS_ERR(pdata->mdo)) + return ERR_CAST(pdata->mdo); return pdata; } @@ -130,34 +120,19 @@ static struct mii_bus *mdio_gpio_bus_init(struct device *dev, { struct mii_bus *new_bus; struct mdio_gpio_info *bitbang; - int mdc, mdio, mdo; - unsigned long mdc_flags = GPIOF_OUT_INIT_LOW; - unsigned long mdio_flags = GPIOF_DIR_IN; - unsigned long mdo_flags = GPIOF_OUT_INIT_HIGH; bitbang = devm_kzalloc(dev, sizeof(*bitbang), GFP_KERNEL); if (!bitbang) - goto out; + return NULL; bitbang->ctrl.ops = &mdio_gpio_ops; - mdc = pdata->mdc; - bitbang->mdc = gpio_to_desc(mdc); - if (pdata->mdc_active_low) - mdc_flags = GPIOF_OUT_INIT_HIGH | GPIOF_ACTIVE_LOW; - mdio = pdata->mdio; - bitbang->mdio = gpio_to_desc(mdio); - if (pdata->mdio_active_low) - mdio_flags |= GPIOF_ACTIVE_LOW; - mdo = pdata->mdo; - if (mdo) { - bitbang->mdo = gpio_to_desc(mdo); - if (pdata->mdo_active_low) - mdo_flags = GPIOF_OUT_INIT_LOW | GPIOF_ACTIVE_LOW; - } + bitbang->mdc = pdata->mdc; + bitbang->mdio = pdata->mdio; + bitbang->mdo = pdata->mdo; new_bus = alloc_mdio_bitbang(&bitbang->ctrl); if (!new_bus) - goto out; + return NULL; new_bus->name = "GPIO Bitbanged MDIO"; new_bus->parent = dev; @@ -167,23 +142,9 @@ static struct mii_bus *mdio_gpio_bus_init(struct device *dev, else strncpy(new_bus->id, "gpio", MII_BUS_ID_SIZE); - if (devm_gpio_request_one(dev, mdc, mdc_flags, "mdc")) - goto out_free_bus; - - if (devm_gpio_request_one(dev, mdio, mdio_flags, "mdio")) - goto out_free_bus; - - if (mdo && devm_gpio_request_one(dev, mdo, mdo_flags, "mdo")) - goto out_free_bus; - dev_set_drvdata(dev, new_bus); return new_bus; - -out_free_bus: - free_mdio_bitbang(new_bus); -out: - return NULL; } static void mdio_gpio_bus_deinit(struct device *dev) @@ -208,7 +169,7 @@ static int mdio_gpio_probe(struct platform_device *pdev) int ret, bus_id; if (pdev->dev.of_node) { - pdata = mdio_gpio_of_get_data(pdev); + pdata = mdio_gpio_of_get_data(&pdev->dev); bus_id = 
of_alias_get_id(pdev->dev.of_node, "mdio-gpio"); if (bus_id < 0) { dev_warn(&pdev->dev, "failed to get alias id\n"); diff --git a/include/linux/platform_data/mdio-gpio.h b/include/linux/platform_data/mdio-gpio.h index af3be0c4ff9b..bd91fa98a3aa 100644 --- a/include/linux/platform_data/mdio-gpio.h +++ b/include/linux/platform_data/mdio-gpio.h @@ -15,13 +15,9 @@ struct mdio_gpio_platform_data { /* GPIO numbers for bus pins */ - unsigned int mdc; - unsigned int mdio; - unsigned int mdo; - - bool mdc_active_low; - bool mdio_active_low; - bool mdo_active_low; + struct gpio_desc *mdc; + struct gpio_desc *mdio; + struct gpio_desc *mdo; }; #endif /* __LINUX_MDIO_GPIO_H */ -- cgit v1.2.3 From fb78a95e22123da57cb79b98bdc62eb33965ca8a Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 19 Apr 2018 01:02:58 +0200 Subject: net: phy: mdio-gpio: Add #defines for the GPIO indices The GPIOs are described in device tree using a list, without names. Add defines to indicate what each index in the list means. These defines should also be used by platform devices passing GPIOs via a GPIO lookup table. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/mdio-gpio.c | 10 +++++++--- include/linux/mdio-gpio.h | 9 +++++++++ 2 files changed, 16 insertions(+), 3 deletions(-) create mode 100644 include/linux/mdio-gpio.h (limited to 'include') diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c index 74b982835ac1..281c905ef9fd 100644 --- a/drivers/net/phy/mdio-gpio.c +++ b/drivers/net/phy/mdio-gpio.c @@ -24,6 +24,8 @@ #include #include #include +#include +#include #include #include @@ -38,15 +40,17 @@ struct mdio_gpio_info { static int mdio_gpio_get_data(struct device *dev, struct mdio_gpio_info *bitbang) { - bitbang->mdc = devm_gpiod_get_index(dev, NULL, 0, GPIOD_OUT_LOW); + bitbang->mdc = devm_gpiod_get_index(dev, NULL, MDIO_GPIO_MDC, + GPIOD_OUT_LOW); if (IS_ERR(bitbang->mdc)) return PTR_ERR(bitbang->mdc); - bitbang->mdio = devm_gpiod_get_index(dev, NULL, 1, GPIOD_IN); + bitbang->mdio = devm_gpiod_get_index(dev, NULL, MDIO_GPIO_MDIO, + GPIOD_IN); if (IS_ERR(bitbang->mdio)) return PTR_ERR(bitbang->mdio); - bitbang->mdo = devm_gpiod_get_index_optional(dev, NULL, 2, + bitbang->mdo = devm_gpiod_get_index_optional(dev, NULL, MDIO_GPIO_MDO, GPIOD_OUT_LOW); return PTR_ERR_OR_ZERO(bitbang->mdo); } diff --git a/include/linux/mdio-gpio.h b/include/linux/mdio-gpio.h new file mode 100644 index 000000000000..cea443a672cb --- /dev/null +++ b/include/linux/mdio-gpio.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_MDIO_GPIO_H +#define __LINUX_MDIO_GPIO_H + +#define MDIO_GPIO_MDC 0 +#define MDIO_GPIO_MDIO 1 +#define MDIO_GPIO_MDO 2 + +#endif -- cgit v1.2.3 From 0207dd1173fe31c153ffd439c4bb33d1341829b1 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 19 Apr 2018 01:02:59 +0200 Subject: net: phy: mdio-gpio: Remove redundant platform data header The platform data header file is now unused. Remove it, but add an extra include which it brought in. Signed-off-by: Andrew Lunn Signed-off-by: David S. 
Miller --- MAINTAINERS | 1 - drivers/net/phy/mdio-gpio.c | 2 +- include/linux/platform_data/mdio-gpio.h | 23 ----------------------- 3 files changed, 1 insertion(+), 25 deletions(-) delete mode 100644 include/linux/platform_data/mdio-gpio.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index b60179d948bb..a7321687cae9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5321,7 +5321,6 @@ F: include/linux/*mdio*.h F: include/linux/of_net.h F: include/linux/phy.h F: include/linux/phy_fixed.h -F: include/linux/platform_data/mdio-gpio.h F: include/linux/platform_data/mdio-bcm-unimac.h F: include/trace/events/mdio.h F: include/uapi/linux/mdio.h diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c index 281c905ef9fd..b501221819e1 100644 --- a/drivers/net/phy/mdio-gpio.c +++ b/drivers/net/phy/mdio-gpio.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include diff --git a/include/linux/platform_data/mdio-gpio.h b/include/linux/platform_data/mdio-gpio.h deleted file mode 100644 index bd91fa98a3aa..000000000000 --- a/include/linux/platform_data/mdio-gpio.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * MDIO-GPIO bus platform data structures - * - * Copyright (C) 2008, Paulius Zaleckas - * - * This file is licensed under the terms of the GNU General Public License - * version 2. This program is licensed "as is" without any warranty of any - * kind, whether express or implied. - */ - -#ifndef __LINUX_MDIO_GPIO_H -#define __LINUX_MDIO_GPIO_H - -#include - -struct mdio_gpio_platform_data { - /* GPIO numbers for bus pins */ - struct gpio_desc *mdc; - struct gpio_desc *mdio; - struct gpio_desc *mdo; -}; - -#endif /* __LINUX_MDIO_GPIO_H */ -- cgit v1.2.3 From 27cced20192d25ae528db8fe694c95c7656f3d56 Mon Sep 17 00:00:00 2001 From: Michael Karcher Date: Thu, 19 Apr 2018 14:05:22 +1200 Subject: net-next: ax88796: Add block_input/output hooks to ax_plat_data Add platform specific hooks for block transfer reads/writes of packet buffer data, superseding the default provided ax_block_input/output. Currently used for m68k Amiga XSurf100. Signed-off-by: Michael Karcher Signed-off-by: Michael Schmitz Signed-off-by: David S. 
Miller --- drivers/net/ethernet/8390/ax88796.c | 10 ++++++++-- include/net/ax88796.h | 9 +++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/8390/ax88796.c b/drivers/net/ethernet/8390/ax88796.c index d3f30f1c40b2..939a572a19ff 100644 --- a/drivers/net/ethernet/8390/ax88796.c +++ b/drivers/net/ethernet/8390/ax88796.c @@ -758,8 +758,14 @@ static int ax_init_dev(struct net_device *dev) #endif ei_local->reset_8390 = &ax_reset_8390; - ei_local->block_input = &ax_block_input; - ei_local->block_output = &ax_block_output; + if (ax->plat->block_input) + ei_local->block_input = ax->plat->block_input; + else + ei_local->block_input = &ax_block_input; + if (ax->plat->block_output) + ei_local->block_output = ax->plat->block_output; + else + ei_local->block_output = &ax_block_output; ei_local->get_8390_hdr = &ax_get_8390_hdr; ei_local->priv = 0; diff --git a/include/net/ax88796.h b/include/net/ax88796.h index b9a3beca0ce4..363b0ca5f7e8 100644 --- a/include/net/ax88796.h +++ b/include/net/ax88796.h @@ -12,6 +12,9 @@ #ifndef __NET_AX88796_PLAT_H #define __NET_AX88796_PLAT_H +struct sk_buff; +struct net_device; + #define AXFLG_HAS_EEPROM (1<<0) #define AXFLG_MAC_FROMDEV (1<<1) /* device already has MAC */ #define AXFLG_HAS_93CX6 (1<<2) /* use eeprom_93cx6 driver */ @@ -26,6 +29,12 @@ struct ax_plat_data { u32 *reg_offsets; /* register offsets */ u8 *mac_addr; /* MAC addr (only used when AXFLG_MAC_FROMPLATFORM is used */ + + /* uses default ax88796 buffer if set to NULL */ + void (*block_output)(struct net_device *dev, int count, + const unsigned char *buf, int star_page); + void (*block_input)(struct net_device *dev, int count, + struct sk_buff *skb, int ring_offset); }; #endif /* __NET_AX88796_PLAT_H */ -- cgit v1.2.3 From cec4c1c54a643608c262bd9bb72cf9bbec64f44a Mon Sep 17 00:00:00 2001 From: Michael Karcher Date: Thu, 19 Apr 2018 14:05:23 +1200 Subject: net-next: ax88796: add interrupt status callback to platform data To be able to tell the ax88796 driver whether it is sensible to enter the 8390 interrupt handler, an "is this interrupt caused by the 88796" callback has been added to the ax_plat_data structure (with NULL being compatible to the previous behaviour). Signed-off-by: Michael Karcher Signed-off-by: Michael Schmitz Signed-off-by: David S. Miller --- drivers/net/ethernet/8390/ax88796.c | 23 +++++++++++++++++++++-- include/net/ax88796.h | 5 +++++ 2 files changed, 26 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/8390/ax88796.c b/drivers/net/ethernet/8390/ax88796.c index 939a572a19ff..d283ed092519 100644 --- a/drivers/net/ethernet/8390/ax88796.c +++ b/drivers/net/ethernet/8390/ax88796.c @@ -163,6 +163,21 @@ static void ax_reset_8390(struct net_device *dev) ei_outb(ENISR_RESET, addr + EN0_ISR); /* Ack intr. */ } +/* Wrapper for __ei_interrupt for platforms that have a platform-specific + * way to find out whether the interrupt request might be caused by + * the ax88796 chip. 
+ */ +static irqreturn_t ax_ei_interrupt_filtered(int irq, void *dev_id) +{ + struct net_device *dev = dev_id; + struct ax_device *ax = to_ax_dev(dev); + struct platform_device *pdev = to_platform_device(dev->dev.parent); + + if (!ax->plat->check_irq(pdev)) + return IRQ_NONE; + + return ax_ei_interrupt(irq, dev_id); +} static void ax_get_8390_hdr(struct net_device *dev, struct e8390_pkt_hdr *hdr, int ring_page) @@ -482,8 +497,12 @@ static int ax_open(struct net_device *dev) if (ret) goto failed_mii; - ret = request_irq(dev->irq, ax_ei_interrupt, ax->irqflags, - dev->name, dev); + if (ax->plat->check_irq) + ret = request_irq(dev->irq, ax_ei_interrupt_filtered, + ax->irqflags, dev->name, dev); + else + ret = request_irq(dev->irq, ax_ei_interrupt, ax->irqflags, + dev->name, dev); if (ret) goto failed_request_irq; diff --git a/include/net/ax88796.h b/include/net/ax88796.h index 363b0ca5f7e8..84b3785d0e66 100644 --- a/include/net/ax88796.h +++ b/include/net/ax88796.h @@ -14,6 +14,7 @@ struct sk_buff; struct net_device; +struct platform_device; #define AXFLG_HAS_EEPROM (1<<0) #define AXFLG_MAC_FROMDEV (1<<1) /* device already has MAC */ @@ -35,6 +36,10 @@ struct ax_plat_data { const unsigned char *buf, int star_page); void (*block_input)(struct net_device *dev, int count, struct sk_buff *skb, int ring_offset); + /* returns nonzero if a pending interrupt request might be caused by + * the ax88796. Handles all interrupts if set to NULL + */ + int (*check_irq)(struct platform_device *pdev); }; #endif /* __NET_AX88796_PLAT_H */ -- cgit v1.2.3 From a597043304a13defc646bb1f16514e4903b36c3c Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 10 Apr 2018 15:06:05 +0200 Subject: clk: Remove clk_init_cb typedef Since commit c08ee14cc6634457 ("clk: ti: change clock init to use generic of_clk_init"), there is only a single (private) user left of the (public) clk_init_cb typedef. Hence expand its single user in the core clock code, and remove the typedef. Signed-off-by: Geert Uytterhoeven Signed-off-by: Michael Turquette Link: lkml.kernel.org/r/1523365565-17124-1-git-send-email-geert+renesas@glider.be --- drivers/clk/clk.c | 2 +- include/linux/clk-provider.h | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index ea67ac81c6f9..972f1ea4b63f 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -3906,7 +3906,7 @@ int of_clk_parent_fill(struct device_node *np, const char **parents, EXPORT_SYMBOL_GPL(of_clk_parent_fill); struct clock_provider { - of_clk_init_cb_t clk_init_cb; + void (*clk_init_cb)(struct device_node *); struct device_node *np; struct list_head node; }; diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 210a890008f9..410a8627b8c0 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -802,8 +802,6 @@ unsigned long clk_hw_round_rate(struct clk_hw *hw, unsigned long rate); struct of_device_id; -typedef void (*of_clk_init_cb_t)(struct device_node *); - struct clk_onecell_data { struct clk **clks; unsigned int clk_num; -- cgit v1.2.3 From 669ca0303ac93adba0e046d414165250861efdb7 Mon Sep 17 00:00:00 2001 From: ryang Date: Thu, 19 Apr 2018 12:18:50 -0400 Subject: regulator: tps6586x: Add support for TPS658624 This version exists in the Samsung Galaxy Tab 10.1 which is based on the Nvidia Tegra 2 board. The TPS658624 has the same SM2 voltage table as TPS658623. 
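For reference, the table selection in find_regulator_info() then reduces to this sketch (condensed from the hunk below; both version IDs share one table):

	switch (version) {
	case TPS658623:
	case TPS658624:
		table = tps658623_regulator;
		num = ARRAY_SIZE(tps658623_regulator);
		break;
	}
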
Signed-off-by: ryang Acked-by: Lee Jones Signed-off-by: Mark Brown --- drivers/regulator/tps6586x-regulator.c | 1 + include/linux/mfd/tps6586x.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/drivers/regulator/tps6586x-regulator.c b/drivers/regulator/tps6586x-regulator.c index 9e9d22038017..ba3dae7b2d2d 100644 --- a/drivers/regulator/tps6586x-regulator.c +++ b/drivers/regulator/tps6586x-regulator.c @@ -342,6 +342,7 @@ static struct tps6586x_regulator *find_regulator_info(int id, int version) switch (version) { case TPS658623: + case TPS658624: table = tps658623_regulator; num = ARRAY_SIZE(tps658623_regulator); break; diff --git a/include/linux/mfd/tps6586x.h b/include/linux/mfd/tps6586x.h index 2fe68e481230..b19c2801a30e 100644 --- a/include/linux/mfd/tps6586x.h +++ b/include/linux/mfd/tps6586x.h @@ -18,6 +18,7 @@ #define TPS658621A 0x15 #define TPS658621CD 0x2c #define TPS658623 0x1b +#define TPS658624 0x0a #define TPS658640 0x01 #define TPS658640v2 0x02 #define TPS658643 0x03 -- cgit v1.2.3 From 02f3703934a42417021405ef336fe45add13c3d1 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 18 Apr 2018 08:54:18 -0700 Subject: regulator: Don't return or expect -errno from of_map_mode() In of_get_regulation_constraints() we were taking the result of of_map_mode() (an unsigned int) and assigning it to an int. We were then checking whether this value was -EINVAL. Some implementers of of_map_mode() were returning -EINVAL (even though the return type of their function needed to be unsigned int) because they needed to signal an error back to of_get_regulation_constraints(). In general in the regulator framework the mode is always referred to as an unsigned int. While we could fix this to be a signed int (the highest value we store in there right now is 0x8), it's actually pretty clean to just define the regulator mode 0x0 (the lack of any bits set) as an invalid mode. Let's do that. 
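For illustration, a driver callback following the new convention might look like this minimal sketch (foo_map_mode() and the FOO_* values are hypothetical; the cpcap and twl4030 hunks below are the real conversions):

	static unsigned int foo_map_mode(unsigned int mode)
	{
		switch (mode) {
		case FOO_MODE_FAST:
			return REGULATOR_MODE_FAST;
		case FOO_MODE_LOW_POWER:
			return REGULATOR_MODE_STANDBY;
		default:
			/* return type is unsigned, so no -EINVAL here */
			return REGULATOR_MODE_INVALID;
		}
	}
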
Fixes: 5e5e3a42c653 ("regulator: of: Add support for parsing initial and suspend modes") Suggested-by: Javier Martinez Canillas Signed-off-by: Douglas Anderson Reviewed-by: Javier Martinez Canillas Signed-off-by: Mark Brown --- drivers/regulator/cpcap-regulator.c | 2 +- drivers/regulator/of_regulator.c | 13 +++++++------ drivers/regulator/twl-regulator.c | 2 +- include/linux/regulator/consumer.h | 1 + 4 files changed, 10 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/regulator/cpcap-regulator.c b/drivers/regulator/cpcap-regulator.c index f541b80f1b54..bd910fe123d9 100644 --- a/drivers/regulator/cpcap-regulator.c +++ b/drivers/regulator/cpcap-regulator.c @@ -222,7 +222,7 @@ static unsigned int cpcap_map_mode(unsigned int mode) case CPCAP_BIT_AUDIO_LOW_PWR: return REGULATOR_MODE_STANDBY; default: - return -EINVAL; + return REGULATOR_MODE_INVALID; } } diff --git a/drivers/regulator/of_regulator.c b/drivers/regulator/of_regulator.c index f47264fa1940..0d3f73eacb99 100644 --- a/drivers/regulator/of_regulator.c +++ b/drivers/regulator/of_regulator.c @@ -31,6 +31,7 @@ static void of_get_regulation_constraints(struct device_node *np, struct regulation_constraints *constraints = &(*init_data)->constraints; struct regulator_state *suspend_state; struct device_node *suspend_np; + unsigned int mode; int ret, i; u32 pval; @@ -124,11 +125,11 @@ static void of_get_regulation_constraints(struct device_node *np, if (!of_property_read_u32(np, "regulator-initial-mode", &pval)) { if (desc && desc->of_map_mode) { - ret = desc->of_map_mode(pval); - if (ret == -EINVAL) + mode = desc->of_map_mode(pval); + if (mode == REGULATOR_MODE_INVALID) pr_err("%s: invalid mode %u\n", np->name, pval); else - constraints->initial_mode = ret; + constraints->initial_mode = mode; } else { pr_warn("%s: mapping for mode %d not defined\n", np->name, pval); @@ -163,12 +164,12 @@ static void of_get_regulation_constraints(struct device_node *np, if (!of_property_read_u32(suspend_np, "regulator-mode", &pval)) { if (desc && desc->of_map_mode) { - ret = desc->of_map_mode(pval); - if (ret == -EINVAL) + mode = desc->of_map_mode(pval); + if (mode == REGULATOR_MODE_INVALID) pr_err("%s: invalid mode %u\n", np->name, pval); else - suspend_state->mode = ret; + suspend_state->mode = mode; } else { pr_warn("%s: mapping for mode %d not defined\n", np->name, pval); diff --git a/drivers/regulator/twl-regulator.c b/drivers/regulator/twl-regulator.c index a4456db5849d..884c7505ed91 100644 --- a/drivers/regulator/twl-regulator.c +++ b/drivers/regulator/twl-regulator.c @@ -274,7 +274,7 @@ static inline unsigned int twl4030reg_map_mode(unsigned int mode) case RES_STATE_SLEEP: return REGULATOR_MODE_STANDBY; default: - return -EINVAL; + return REGULATOR_MODE_INVALID; } } diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index df176d7c2b87..25602afd4844 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -80,6 +80,7 @@ struct regmap; * These modes can be OR'ed together to make up a mask of valid register modes. */ +#define REGULATOR_MODE_INVALID 0x0 #define REGULATOR_MODE_FAST 0x1 #define REGULATOR_MODE_NORMAL 0x2 #define REGULATOR_MODE_IDLE 0x4 -- cgit v1.2.3 From 95d1544eb643847e05df06c3de252609593c9073 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Fri, 23 Mar 2018 16:59:52 -0400 Subject: media: rc: add ioctl to get the current timeout Since the kernel now modifies the timeout, make it possible to retrieve the current value. 
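For illustration, user space can query the value along these lines (minimal sketch; assumes <linux/lirc.h>, <sys/ioctl.h> and <stdio.h> are included and fd is an open lirc chardev):

	__u32 timeout;

	/* reported in microseconds; fails with ENOTTY if no timeout is set */
	if (ioctl(fd, LIRC_GET_REC_TIMEOUT, &timeout) == 0)
		printf("IR inactivity timeout: %u us\n", timeout);
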
Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/uapi/rc/lirc-func.rst | 1 + Documentation/media/uapi/rc/lirc-set-rec-timeout.rst | 14 +++++++++----- drivers/media/rc/lirc_dev.c | 7 +++++++ include/uapi/linux/lirc.h | 6 ++++++ 4 files changed, 23 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/Documentation/media/uapi/rc/lirc-func.rst b/Documentation/media/uapi/rc/lirc-func.rst index ddb4620de294..9656423a3f28 100644 --- a/Documentation/media/uapi/rc/lirc-func.rst +++ b/Documentation/media/uapi/rc/lirc-func.rst @@ -17,6 +17,7 @@ LIRC Function Reference lirc-get-rec-resolution lirc-set-send-duty-cycle lirc-get-timeout + lirc-get-rec-timeout lirc-set-rec-timeout lirc-set-rec-carrier lirc-set-rec-carrier-range diff --git a/Documentation/media/uapi/rc/lirc-set-rec-timeout.rst b/Documentation/media/uapi/rc/lirc-set-rec-timeout.rst index b3e16bbdbc90..a833a6a4c25a 100644 --- a/Documentation/media/uapi/rc/lirc-set-rec-timeout.rst +++ b/Documentation/media/uapi/rc/lirc-set-rec-timeout.rst @@ -1,19 +1,23 @@ .. -*- coding: utf-8; mode: rst -*- .. _lirc_set_rec_timeout: +.. _lirc_get_rec_timeout: -************************** -ioctl LIRC_SET_REC_TIMEOUT -************************** +*************************************************** +ioctl LIRC_GET_REC_TIMEOUT and LIRC_SET_REC_TIMEOUT +*************************************************** Name ==== -LIRC_SET_REC_TIMEOUT - sets the integer value for IR inactivity timeout. +LIRC_GET_REC_TIMEOUT/LIRC_SET_REC_TIMEOUT - Get/set the integer value for IR inactivity timeout. Synopsis ======== +.. c:function:: int ioctl( int fd, LIRC_GET_REC_TIMEOUT, __u32 *timeout ) + :name: LIRC_GET_REC_TIMEOUT + .. c:function:: int ioctl( int fd, LIRC_SET_REC_TIMEOUT, __u32 *timeout ) :name: LIRC_SET_REC_TIMEOUT @@ -30,7 +34,7 @@ Arguments Description =========== -Sets the integer value for IR inactivity timeout. +Get and set the integer value for IR inactivity timeout. If supported by the hardware, setting it to 0 disables all hardware timeouts and data should be reported as soon as possible. If the exact value diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index 247e6fc3dc0c..6b4755e9fa25 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -575,6 +575,13 @@ static long ir_lirc_ioctl(struct file *file, unsigned int cmd, } break; + case LIRC_GET_REC_TIMEOUT: + if (!dev->timeout) + ret = -ENOTTY; + else + val = DIV_ROUND_UP(dev->timeout, 1000); + break; + case LIRC_SET_REC_TIMEOUT_REPORTS: if (!dev->timeout) ret = -ENOTTY; diff --git a/include/uapi/linux/lirc.h b/include/uapi/linux/lirc.h index f189931042a7..6b319581882f 100644 --- a/include/uapi/linux/lirc.h +++ b/include/uapi/linux/lirc.h @@ -133,6 +133,12 @@ #define LIRC_SET_WIDEBAND_RECEIVER _IOW('i', 0x00000023, __u32) +/* + * Return the recording timeout, which is either set by + * the ioctl LIRC_SET_REC_TIMEOUT or by the kernel after setting the protocols. 
+ */ +#define LIRC_GET_REC_TIMEOUT _IOR('i', 0x00000024, __u32) + /* * struct lirc_scancode - decoded scancode with protocol for use with * LIRC_MODE_SCANCODE -- cgit v1.2.3 From f991f01571281b82e5e6ca74445f5d3f42d72212 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 5 May 2015 23:13:15 +0200 Subject: y2038: asm-generic: Extend sysvipc data structures Most architectures now use the asm-generic copy of the sysvipc data structures (msqid64_ds, semid64_ds, shmid64_ds), which use 32-bit __kernel_time_t on 32-bit architectures but have padding behind them to allow extending the type to 64-bit. Unfortunately, that fails on all big-endian architectures, which have the padding on the wrong side. As so many of them get it wrong, we decided to not bother even trying to fix it up when we introduced the asm-generic copy. Instead we always use the padding word now to provide the upper 32 bits of the seconds value, regardless of the endianness. A libc implementation on a typical big-endian system can deal with this by providing its own copy of the structure definition to user space, and swapping the two 32-bit words before returning from the semctl/shmctl/msgctl system calls. Note that msqid64_ds and shmid64_ds were broken on x32 since commit f4b4aae18288 ("x86/headers/uapi: Fix __BITS_PER_LONG value for x32 builds"). I have sent a separate fix for that, but as we no longer have to worry about x32 here, I simply use 'unsigned long' instead of __kernel_ulong_t. Signed-off-by: Arnd Bergmann --- include/uapi/asm-generic/msgbuf.h | 27 +++++++++++++------------- include/uapi/asm-generic/sembuf.h | 26 +++++++++++++++---------- include/uapi/asm-generic/shmbuf.h | 41 +++++++++++++++++++-------------------- 3 files changed, 49 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/include/uapi/asm-generic/msgbuf.h b/include/uapi/asm-generic/msgbuf.h index fb306ebdb36f..9fe4881557cb 100644 --- a/include/uapi/asm-generic/msgbuf.h +++ b/include/uapi/asm-generic/msgbuf.h @@ -18,31 +18,30 @@ * On big-endian systems, the padding is in the wrong place. 
* * Pad space is left for: - * - 64-bit time_t to solve y2038 problem * - 2 miscellaneous 32-bit values */ struct msqid64_ds { struct ipc64_perm msg_perm; +#if __BITS_PER_LONG == 64 __kernel_time_t msg_stime; /* last msgsnd time */ -#if __BITS_PER_LONG != 64 - unsigned long __unused1; -#endif __kernel_time_t msg_rtime; /* last msgrcv time */ -#if __BITS_PER_LONG != 64 - unsigned long __unused2; -#endif __kernel_time_t msg_ctime; /* last change time */ -#if __BITS_PER_LONG != 64 - unsigned long __unused3; +#else + unsigned long msg_stime; /* last msgsnd time */ + unsigned long msg_stime_high; + unsigned long msg_rtime; /* last msgrcv time */ + unsigned long msg_rtime_high; + unsigned long msg_ctime; /* last change time */ + unsigned long msg_ctime_high; #endif - __kernel_ulong_t msg_cbytes; /* current number of bytes on queue */ - __kernel_ulong_t msg_qnum; /* number of messages in queue */ - __kernel_ulong_t msg_qbytes; /* max number of bytes on queue */ + unsigned long msg_cbytes; /* current number of bytes on queue */ + unsigned long msg_qnum; /* number of messages in queue */ + unsigned long msg_qbytes; /* max number of bytes on queue */ __kernel_pid_t msg_lspid; /* pid of last msgsnd */ __kernel_pid_t msg_lrpid; /* last receive pid */ - __kernel_ulong_t __unused4; - __kernel_ulong_t __unused5; + unsigned long __unused4; + unsigned long __unused5; }; #endif /* __ASM_GENERIC_MSGBUF_H */ diff --git a/include/uapi/asm-generic/sembuf.h b/include/uapi/asm-generic/sembuf.h index cbf9cfe977d6..0bae010f1b64 100644 --- a/include/uapi/asm-generic/sembuf.h +++ b/include/uapi/asm-generic/sembuf.h @@ -13,23 +13,29 @@ * everyone just ended up making identical copies without specific * optimizations, so we may just as well all use the same one. * - * 64 bit architectures typically define a 64 bit __kernel_time_t, + * 64 bit architectures use a 64-bit __kernel_time_t here, while + * 32 bit architectures have a pair of unsigned long values. * so they do not need the first two padding words. - * On big-endian systems, the padding is in the wrong place. * - * Pad space is left for: - * - 64-bit time_t to solve y2038 problem - * - 2 miscellaneous 32-bit values + * On big-endian systems, the padding is in the wrong place for + * historic reasons, so user space has to reconstruct a time_t + * value using + * + * user_semid_ds.sem_otime = kernel_semid64_ds.sem_otime + + * ((long long)kernel_semid64_ds.sem_otime_high << 32) + * + * Pad space is left for 2 miscellaneous 32-bit values */ struct semid64_ds { struct ipc64_perm sem_perm; /* permissions .. see ipc.h */ +#if __BITS_PER_LONG == 64 __kernel_time_t sem_otime; /* last semop time */ -#if __BITS_PER_LONG != 64 - unsigned long __unused1; -#endif __kernel_time_t sem_ctime; /* last change time */ -#if __BITS_PER_LONG != 64 - unsigned long __unused2; +#else + unsigned long sem_otime; /* last semop time */ + unsigned long sem_otime_high; + unsigned long sem_ctime; /* last change time */ + unsigned long sem_ctime_high; #endif unsigned long sem_nsems; /* no. 
of semaphores in array */ unsigned long __unused3; diff --git a/include/uapi/asm-generic/shmbuf.h b/include/uapi/asm-generic/shmbuf.h index 2b6c3bb97f97..e504422fc501 100644 --- a/include/uapi/asm-generic/shmbuf.h +++ b/include/uapi/asm-generic/shmbuf.h @@ -19,42 +19,41 @@ * * * Pad space is left for: - * - 64-bit time_t to solve y2038 problem * - 2 miscellaneous 32-bit values */ struct shmid64_ds { struct ipc64_perm shm_perm; /* operation perms */ size_t shm_segsz; /* size of segment (bytes) */ +#if __BITS_PER_LONG == 64 __kernel_time_t shm_atime; /* last attach time */ -#if __BITS_PER_LONG != 64 - unsigned long __unused1; -#endif __kernel_time_t shm_dtime; /* last detach time */ -#if __BITS_PER_LONG != 64 - unsigned long __unused2; -#endif __kernel_time_t shm_ctime; /* last change time */ -#if __BITS_PER_LONG != 64 - unsigned long __unused3; +#else + unsigned long shm_atime; /* last attach time */ + unsigned long shm_atime_high; + unsigned long shm_dtime; /* last detach time */ + unsigned long shm_dtime_high; + unsigned long shm_ctime; /* last change time */ + unsigned long shm_ctime_high; #endif __kernel_pid_t shm_cpid; /* pid of creator */ __kernel_pid_t shm_lpid; /* pid of last operator */ - __kernel_ulong_t shm_nattch; /* no. of current attaches */ - __kernel_ulong_t __unused4; - __kernel_ulong_t __unused5; + unsigned long shm_nattch; /* no. of current attaches */ + unsigned long __unused4; + unsigned long __unused5; }; struct shminfo64 { - __kernel_ulong_t shmmax; - __kernel_ulong_t shmmin; - __kernel_ulong_t shmmni; - __kernel_ulong_t shmseg; - __kernel_ulong_t shmall; - __kernel_ulong_t __unused1; - __kernel_ulong_t __unused2; - __kernel_ulong_t __unused3; - __kernel_ulong_t __unused4; + unsigned long shmmax; + unsigned long shmmin; + unsigned long shmmni; + unsigned long shmseg; + unsigned long shmall; + unsigned long __unused1; + unsigned long __unused2; + unsigned long __unused3; + unsigned long __unused4; }; #endif /* __ASM_GENERIC_SHMBUF_H */ -- cgit v1.2.3 From 21fc538d817ce671f1a28a03996c715247c2ac89 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 13 Apr 2018 13:58:00 +0200 Subject: y2038: ipc: Use __kernel_timespec This is a preparation for changing over __kernel_timespec to 64-bit times, which involves assigning new system call numbers for mq_timedsend(), mq_timedreceive() and semtimedop() for compatibility with future y2038-proof user space. The existing ABIs will remain available through compat code. 
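For illustration, the kernel-side boundary then follows one pattern everywhere; a condensed sketch of the mqueue helper touched below (the validity check is paraphrased):

	static int prepare_timeout(const struct __kernel_timespec __user *u_abs_timeout,
				   struct timespec64 *ts)
	{
		if (get_timespec64(ts, u_abs_timeout))
			return -EFAULT;

		return timespec64_valid(ts) ? 0 : -EINVAL;
	}
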
Signed-off-by: Arnd Bergmann --- include/linux/syscalls.h | 6 +++--- ipc/mqueue.c | 6 +++--- ipc/sem.c | 4 ++-- ipc/util.h | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index c9a2a2601852..b92cb79d38c3 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -680,8 +680,8 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info); /* ipc/mqueue.c */ asmlinkage long sys_mq_open(const char __user *name, int oflag, umode_t mode, struct mq_attr __user *attr); asmlinkage long sys_mq_unlink(const char __user *name); -asmlinkage long sys_mq_timedsend(mqd_t mqdes, const char __user *msg_ptr, size_t msg_len, unsigned int msg_prio, const struct timespec __user *abs_timeout); -asmlinkage long sys_mq_timedreceive(mqd_t mqdes, char __user *msg_ptr, size_t msg_len, unsigned int __user *msg_prio, const struct timespec __user *abs_timeout); +asmlinkage long sys_mq_timedsend(mqd_t mqdes, const char __user *msg_ptr, size_t msg_len, unsigned int msg_prio, const struct __kernel_timespec __user *abs_timeout); +asmlinkage long sys_mq_timedreceive(mqd_t mqdes, char __user *msg_ptr, size_t msg_len, unsigned int __user *msg_prio, const struct __kernel_timespec __user *abs_timeout); asmlinkage long sys_mq_notify(mqd_t mqdes, const struct sigevent __user *notification); asmlinkage long sys_mq_getsetattr(mqd_t mqdes, const struct mq_attr __user *mqstat, struct mq_attr __user *omqstat); @@ -698,7 +698,7 @@ asmlinkage long sys_semget(key_t key, int nsems, int semflg); asmlinkage long sys_semctl(int semid, int semnum, int cmd, unsigned long arg); asmlinkage long sys_semtimedop(int semid, struct sembuf __user *sops, unsigned nsops, - const struct timespec __user *timeout); + const struct __kernel_timespec __user *timeout); asmlinkage long sys_semop(int semid, struct sembuf __user *sops, unsigned nsops); diff --git a/ipc/mqueue.c b/ipc/mqueue.c index a808f29d4c5a..9610afcfa2e5 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -691,7 +691,7 @@ static void __do_notify(struct mqueue_inode_info *info) wake_up(&info->wait_q); } -static int prepare_timeout(const struct timespec __user *u_abs_timeout, +static int prepare_timeout(const struct __kernel_timespec __user *u_abs_timeout, struct timespec64 *ts) { if (get_timespec64(ts, u_abs_timeout)) @@ -1128,7 +1128,7 @@ out: SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr, size_t, msg_len, unsigned int, msg_prio, - const struct timespec __user *, u_abs_timeout) + const struct __kernel_timespec __user *, u_abs_timeout) { struct timespec64 ts, *p = NULL; if (u_abs_timeout) { @@ -1142,7 +1142,7 @@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr, SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr, size_t, msg_len, unsigned int __user *, u_msg_prio, - const struct timespec __user *, u_abs_timeout) + const struct __kernel_timespec __user *, u_abs_timeout) { struct timespec64 ts, *p = NULL; if (u_abs_timeout) { diff --git a/ipc/sem.c b/ipc/sem.c index 8935cd8cf166..b951e25ba2db 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -2176,7 +2176,7 @@ out_free: } long ksys_semtimedop(int semid, struct sembuf __user *tsops, - unsigned int nsops, const struct timespec __user *timeout) + unsigned int nsops, const struct __kernel_timespec __user *timeout) { if (timeout) { struct timespec64 ts; @@ -2188,7 +2188,7 @@ long ksys_semtimedop(int semid, struct sembuf __user *tsops, } SYSCALL_DEFINE4(semtimedop, int, semid, struct 
sembuf __user *, tsops, - unsigned int, nsops, const struct timespec __user *, timeout) + unsigned int, nsops, const struct __kernel_timespec __user *, timeout) { return ksys_semtimedop(semid, tsops, nsops, timeout); } diff --git a/ipc/util.h b/ipc/util.h index acc5159e96d0..975c6de2df9d 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -251,7 +251,7 @@ static inline int compat_ipc_parse_version(int *cmd) /* for __ARCH_WANT_SYS_IPC */ long ksys_semtimedop(int semid, struct sembuf __user *tsops, unsigned int nsops, - const struct timespec __user *timeout); + const struct __kernel_timespec __user *timeout); long ksys_semget(key_t key, int nsems, int semflg); long ksys_semctl(int semid, int semnum, int cmd, unsigned long arg); long ksys_msgget(key_t key, int msgflg); -- cgit v1.2.3 From c7f1770f369ae0ec632b3ddc2bd6677b0e485128 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 5 Apr 2018 10:54:14 -0400 Subject: media: omap: omap-iommu.h: allow building drivers with COMPILE_TEST Drivers that depend on omap-iommu.h (currently, just omap3isp) need a stub implementation in order to be built with COMPILE_TEST. Acked-by: Sakari Ailus Acked-by: Hans Verkuil Acked-by: Arnd Bergmann Acked-by: Tony Lindgren Signed-off-by: Mauro Carvalho Chehab --- include/linux/omap-iommu.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/omap-iommu.h b/include/linux/omap-iommu.h index c1aede46718b..ce1b7c6283ee 100644 --- a/include/linux/omap-iommu.h +++ b/include/linux/omap-iommu.h @@ -13,7 +13,12 @@ #ifndef _OMAP_IOMMU_H_ #define _OMAP_IOMMU_H_ +#ifdef CONFIG_OMAP_IOMMU extern void omap_iommu_save_ctx(struct device *dev); extern void omap_iommu_restore_ctx(struct device *dev); +#else +static inline void omap_iommu_save_ctx(struct device *dev) {} +static inline void omap_iommu_restore_ctx(struct device *dev) {} +#endif #endif -- cgit v1.2.3 From a4dfa72d0acd1c99a160e25c099849ae37ad13fd Mon Sep 17 00:00:00 2001 From: GhantaKrishnamurthy MohanKrishna Date: Thu, 19 Apr 2018 11:06:18 +0200 Subject: tipc: set default MTU for UDP media Currently, all bearers are configured with MTU value same as the underlying L2 device. However, in case of bearers with media type UDP, higher throughput is possible with a fixed and higher emulated MTU value than adapting to the underlying L2 MTU. In this commit, we introduce a parameter mtu in struct tipc_media and a default value is set for UDP. A default value of 14k was determined by experimentation and found to have a higher throughput than 16k. MTU for UDP bearers are assigned the above set value of media MTU. Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: GhantaKrishnamurthy MohanKrishna Signed-off-by: David S. 
Miller --- include/uapi/linux/tipc_config.h | 5 +++++ net/tipc/udp_media.c | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/tipc_config.h b/include/uapi/linux/tipc_config.h index 3f29e3c8ed06..4b2c93b1934c 100644 --- a/include/uapi/linux/tipc_config.h +++ b/include/uapi/linux/tipc_config.h @@ -185,6 +185,11 @@ #define TIPC_DEF_LINK_WIN 50 #define TIPC_MAX_LINK_WIN 8191 +/* + * Default MTU for UDP media + */ + +#define TIPC_DEF_LINK_UDP_MTU 14000 struct tipc_node_info { __be32 addr; /* network address of node */ diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index e7d91f5d5cae..9783101bc4a9 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -713,8 +713,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, err = -EINVAL; goto err; } - b->mtu = dev->mtu - sizeof(struct iphdr) - - sizeof(struct udphdr); + b->mtu = b->media->mtu; #if IS_ENABLED(CONFIG_IPV6) } else if (local.proto == htons(ETH_P_IPV6)) { udp_conf.family = AF_INET6; @@ -803,6 +802,7 @@ struct tipc_media udp_media_info = { .priority = TIPC_DEF_LINK_PRI, .tolerance = TIPC_DEF_LINK_TOL, .window = TIPC_DEF_LINK_WIN, + .mtu = TIPC_DEF_LINK_UDP_MTU, .type_id = TIPC_MEDIA_TYPE_UDP, .hwaddr_len = 0, .name = "udp" -- cgit v1.2.3 From 901271e0403af638c224987c2a4e55cebade7e91 Mon Sep 17 00:00:00 2001 From: GhantaKrishnamurthy MohanKrishna Date: Thu, 19 Apr 2018 11:06:19 +0200 Subject: tipc: implement configuration of UDP media MTU In previous commit, we changed the default emulated MTU for UDP bearers to 14k. This commit adds the functionality to set/change the default value by configuring new MTU for UDP media. UDP bearer(s) have to be disabled and enabled back for the new MTU to take effect. Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: GhantaKrishnamurthy MohanKrishna Signed-off-by: David S. 
Miller --- include/uapi/linux/tipc_netlink.h | 1 + net/tipc/bearer.c | 13 +++++++++++++ net/tipc/bearer.h | 3 +++ net/tipc/udp_media.h | 14 ++++++++++++++ 4 files changed, 31 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index 0affb682e5e3..85c11982c89b 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -266,6 +266,7 @@ enum { TIPC_NLA_PROP_PRIO, /* u32 */ TIPC_NLA_PROP_TOL, /* u32 */ TIPC_NLA_PROP_WIN, /* u32 */ + TIPC_NLA_PROP_MTU, /* u32 */ __TIPC_NLA_PROP_MAX, TIPC_NLA_PROP_MAX = __TIPC_NLA_PROP_MAX - 1 diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index f7d47c89d658..a22caf9e5a18 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -1029,6 +1029,9 @@ static int __tipc_nl_add_media(struct tipc_nl_msg *msg, goto prop_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, media->window)) goto prop_msg_full; + if (media->type_id == TIPC_MEDIA_TYPE_UDP) + if (nla_put_u32(msg->skb, TIPC_NLA_PROP_MTU, media->mtu)) + goto prop_msg_full; nla_nest_end(msg->skb, prop); nla_nest_end(msg->skb, attrs); @@ -1158,6 +1161,16 @@ int __tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info) m->priority = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); if (props[TIPC_NLA_PROP_WIN]) m->window = nla_get_u32(props[TIPC_NLA_PROP_WIN]); + if (props[TIPC_NLA_PROP_MTU]) { + if (m->type_id != TIPC_MEDIA_TYPE_UDP) + return -EINVAL; +#ifdef CONFIG_TIPC_MEDIA_UDP + if (tipc_udp_mtu_bad(nla_get_u32 + (props[TIPC_NLA_PROP_MTU]))) + return -EINVAL; + m->mtu = nla_get_u32(props[TIPC_NLA_PROP_MTU]); +#endif + } } return 0; diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index 6efcee63a381..394290cbbb1d 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -94,6 +94,8 @@ struct tipc_bearer; * @priority: default link (and bearer) priority * @tolerance: default time (in ms) before declaring link failure * @window: default window (in packets) before declaring link congestion + * @mtu: max packet size bearer can support for media type not dependent on + * underlying device MTU * @type_id: TIPC media identifier * @hwaddr_len: TIPC media address len * @name: media name @@ -118,6 +120,7 @@ struct tipc_media { u32 priority; u32 tolerance; u32 window; + u32 mtu; u32 type_id; u32 hwaddr_len; char name[TIPC_MAX_MEDIA_NAME]; diff --git a/net/tipc/udp_media.h b/net/tipc/udp_media.h index 281bbae87726..e7455cc73e16 100644 --- a/net/tipc/udp_media.h +++ b/net/tipc/udp_media.h @@ -38,9 +38,23 @@ #ifndef _TIPC_UDP_MEDIA_H #define _TIPC_UDP_MEDIA_H +#include +#include + int tipc_udp_nl_bearer_add(struct tipc_bearer *b, struct nlattr *attr); int tipc_udp_nl_add_bearer_data(struct tipc_nl_msg *msg, struct tipc_bearer *b); int tipc_udp_nl_dump_remoteip(struct sk_buff *skb, struct netlink_callback *cb); +/* check if configured MTU is too low for tipc headers */ +static inline bool tipc_udp_mtu_bad(u32 mtu) +{ + if (mtu >= (TIPC_MIN_BEARER_MTU + sizeof(struct iphdr) + + sizeof(struct udphdr))) + return false; + + pr_warn("MTU too low for tipc bearer\n"); + return true; +} + #endif #endif -- cgit v1.2.3 From 809c45a091d93e05c6e9b5d53bb3f1185273286b Mon Sep 17 00:00:00 2001 From: Shahed Shaikh Date: Thu, 19 Apr 2018 05:50:12 -0700 Subject: qed* : Add new TLV to request PF to update MAC in bulletin board There may be a need for VF driver to request PF to explicitly update its bulletin with a MAC address. e.g. 
When user assigns a MAC address to VF while VF is still down, and PF's bulletin board contains different MAC address, in this case, when VF's interface is brought up, it gets loaded with MAC address from bulletin board which is not desirable. To handle this corner case, we need a new TLV to request PF to update its bulletin board with suggested MAC. This request will be honored only for trusted VFs. Signed-off-by: Shahed Shaikh Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_l2.c | 19 +++++++++++++ drivers/net/ethernet/qlogic/qed/qed_sriov.c | 37 ++++++++++++++++++++++++++ drivers/net/ethernet/qlogic/qed/qed_vf.c | 29 ++++++++++++++++++++ drivers/net/ethernet/qlogic/qed/qed_vf.h | 21 +++++++++++++++ drivers/net/ethernet/qlogic/qede/qede_filter.c | 4 +++ include/linux/qed/qed_eth_if.h | 1 + 6 files changed, 111 insertions(+) (limited to 'include') diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c index e874504e8b28..8b1b7e8ca56c 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_l2.c +++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c @@ -2850,6 +2850,24 @@ static int qed_fp_cqe_completion(struct qed_dev *dev, cqe); } +static int qed_req_bulletin_update_mac(struct qed_dev *cdev, u8 *mac) +{ + int i, ret; + + if (IS_PF(cdev)) + return 0; + + for_each_hwfn(cdev, i) { + struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; + + ret = qed_vf_pf_bulletin_update_mac(p_hwfn, mac); + if (ret) + return ret; + } + + return 0; +} + #ifdef CONFIG_QED_SRIOV extern const struct qed_iov_hv_ops qed_iov_ops_pass; #endif @@ -2887,6 +2905,7 @@ static const struct qed_eth_ops qed_eth_ops_pass = { .ntuple_filter_config = &qed_ntuple_arfs_filter_config, .configure_arfs_searcher = &qed_configure_arfs_searcher, .get_coalesce = &qed_get_coalesce, + .req_bulletin_update_mac = &qed_req_bulletin_update_mac, }; const struct qed_eth_ops *qed_get_eth_ops(void) diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c index 77376fda8eae..f01bf52bc381 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c +++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c @@ -3820,6 +3820,40 @@ static void qed_iov_get_link(struct qed_hwfn *p_hwfn, __qed_vf_get_link_caps(p_hwfn, p_caps, p_bulletin); } +static int +qed_iov_vf_pf_bulletin_update_mac(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_vf_info *p_vf) +{ + struct qed_bulletin_content *p_bulletin = p_vf->bulletin.p_virt; + struct qed_iov_vf_mbx *mbx = &p_vf->vf_mbx; + struct vfpf_bulletin_update_mac_tlv *p_req; + u8 status = PFVF_STATUS_SUCCESS; + int rc = 0; + + if (!p_vf->p_vf_info.is_trusted_configured) { + DP_VERBOSE(p_hwfn, + QED_MSG_IOV, + "Blocking bulletin update request from untrusted VF[%d]\n", + p_vf->abs_vf_id); + status = PFVF_STATUS_NOT_SUPPORTED; + rc = -EINVAL; + goto send_status; + } + + p_req = &mbx->req_virt->bulletin_update_mac; + ether_addr_copy(p_bulletin->mac, p_req->mac); + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "Updated bulletin of VF[%d] with requested MAC[%pM]\n", + p_vf->abs_vf_id, p_req->mac); + +send_status: + qed_iov_prepare_resp(p_hwfn, p_ptt, p_vf, + CHANNEL_TLV_BULLETIN_UPDATE_MAC, + sizeof(struct pfvf_def_resp_tlv), status); + return rc; +} + static void qed_iov_process_mbx_req(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, int vfid) { @@ -3899,6 +3933,9 @@ static void qed_iov_process_mbx_req(struct qed_hwfn *p_hwfn, case CHANNEL_TLV_COALESCE_READ: qed_iov_vf_pf_get_coalesce(p_hwfn, p_ptt, p_vf); break; + case CHANNEL_TLV_BULLETIN_UPDATE_MAC: 
+ qed_iov_vf_pf_bulletin_update_mac(p_hwfn, p_ptt, p_vf); + break; } } else if (qed_iov_tlv_supported(mbx->first_tlv.tl.type)) { DP_VERBOSE(p_hwfn, QED_MSG_IOV, diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.c b/drivers/net/ethernet/qlogic/qed/qed_vf.c index 91b5e9f02a62..2d7fcd6a0777 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_vf.c +++ b/drivers/net/ethernet/qlogic/qed/qed_vf.c @@ -1374,6 +1374,35 @@ exit: return rc; } +int +qed_vf_pf_bulletin_update_mac(struct qed_hwfn *p_hwfn, + u8 *p_mac) +{ + struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info; + struct vfpf_bulletin_update_mac_tlv *p_req; + struct pfvf_def_resp_tlv *p_resp; + int rc; + + if (!p_mac) + return -EINVAL; + + /* clear mailbox and prep header tlv */ + p_req = qed_vf_pf_prep(p_hwfn, CHANNEL_TLV_BULLETIN_UPDATE_MAC, + sizeof(*p_req)); + ether_addr_copy(p_req->mac, p_mac); + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "Requesting bulletin update for MAC[%pM]\n", p_mac); + + /* add list termination tlv */ + qed_add_tlv(p_hwfn, &p_iov->offset, CHANNEL_TLV_LIST_END, + sizeof(struct channel_list_end_tlv)); + + p_resp = &p_iov->pf2vf_reply->default_resp; + rc = qed_send_msg2pf(p_hwfn, &p_resp->hdr.status, sizeof(*p_resp)); + qed_vf_pf_req_end(p_hwfn, rc); + return rc; +} + int qed_vf_pf_set_coalesce(struct qed_hwfn *p_hwfn, u16 rx_coal, u16 tx_coal, struct qed_queue_cid *p_cid) diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.h b/drivers/net/ethernet/qlogic/qed/qed_vf.h index 97d44dfb38ca..4f05d5eb3cf5 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_vf.h +++ b/drivers/net/ethernet/qlogic/qed/qed_vf.h @@ -518,6 +518,12 @@ struct pfvf_read_coal_resp_tlv { u8 padding[6]; }; +struct vfpf_bulletin_update_mac_tlv { + struct vfpf_first_tlv first_tlv; + u8 mac[ETH_ALEN]; + u8 padding[2]; +}; + union vfpf_tlvs { struct vfpf_first_tlv first_tlv; struct vfpf_acquire_tlv acquire; @@ -532,6 +538,7 @@ union vfpf_tlvs { struct vfpf_update_tunn_param_tlv tunn_param_update; struct vfpf_update_coalesce update_coalesce; struct vfpf_read_coal_req_tlv read_coal_req; + struct vfpf_bulletin_update_mac_tlv bulletin_update_mac; struct tlv_buffer_size tlv_buf_size; }; @@ -650,6 +657,7 @@ enum { CHANNEL_TLV_COALESCE_UPDATE, CHANNEL_TLV_QID, CHANNEL_TLV_COALESCE_READ, + CHANNEL_TLV_BULLETIN_UPDATE_MAC, CHANNEL_TLV_MAX, /* Required for iterating over vport-update tlvs. 
@@ -1042,6 +1050,13 @@ int qed_vf_pf_tunnel_param_update(struct qed_hwfn *p_hwfn, struct qed_tunnel_info *p_tunn); u32 qed_vf_hw_bar_size(struct qed_hwfn *p_hwfn, enum BAR_ID bar_id); +/** + * @brief - Ask PF to update the MAC address in its bulletin board + * + * @param p_mac - mac address to be updated in bulletin board + */ +int qed_vf_pf_bulletin_update_mac(struct qed_hwfn *p_hwfn, u8 *p_mac); + #else static inline void qed_vf_get_link_params(struct qed_hwfn *p_hwfn, struct qed_mcp_link_params *params) @@ -1228,6 +1243,12 @@ static inline int qed_vf_pf_tunnel_param_update(struct qed_hwfn *p_hwfn, return -EINVAL; } +static inline int qed_vf_pf_bulletin_update_mac(struct qed_hwfn *p_hwfn, + u8 *p_mac) +{ + return -EINVAL; +} + static inline u32 qed_vf_hw_bar_size(struct qed_hwfn *p_hwfn, enum BAR_ID bar_id) diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c index 8094f035b705..43569b1839be 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_filter.c +++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c @@ -1160,6 +1160,10 @@ int qede_set_mac_addr(struct net_device *ndev, void *p) if (edev->state != QEDE_STATE_OPEN) { DP_VERBOSE(edev, NETIF_MSG_IFDOWN, "The device is currently down\n"); + /* Ask PF to explicitly update a copy in bulletin board */ + if (IS_VF(edev) && edev->ops->req_bulletin_update_mac) + edev->ops->req_bulletin_update_mac(edev->cdev, + ndev->dev_addr); goto out; } diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h index 147d08ccf813..7f9756fe9180 100644 --- a/include/linux/qed/qed_eth_if.h +++ b/include/linux/qed/qed_eth_if.h @@ -352,6 +352,7 @@ struct qed_eth_ops { int (*configure_arfs_searcher)(struct qed_dev *cdev, enum qed_filter_config_mode mode); int (*get_coalesce)(struct qed_dev *cdev, u16 *coal, void *handle); + int (*req_bulletin_update_mac)(struct qed_dev *cdev, u8 *mac); }; const struct qed_eth_ops *qed_get_eth_ops(void); -- cgit v1.2.3 From 1827b0678863bc97a1653fdf5308762b2aefcd56 Mon Sep 17 00:00:00 2001 From: Phil Elwell Date: Thu, 19 Apr 2018 17:59:39 +0100 Subject: lan78xx: Read LED states from Device Tree Add support for DT property "microchip,led-modes", a vector of zero to four cells (u32s) in the range 0-15, each of which sets the mode for one of the LEDs. Some possible values are: 0=link/activity 1=link1000/activity 2=link100/activity 3=link10/activity 4=link100/1000/activity 5=link10/1000/activity 6=link10/100/activity 14=off 15=on These values are given symbolic constants in a dt-bindings header. Also use the presence of the DT property to indicate that the LEDs should be enabled - necessary in the event that no valid OTP or EEPROM is available. Signed-off-by: Phil Elwell Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- MAINTAINERS | 1 + drivers/net/phy/microchip.c | 25 ++++++++++++++++++++++ drivers/net/usb/lan78xx.c | 32 ++++++++++++++++++++++++++++- include/dt-bindings/net/microchip-lan78xx.h | 21 +++++++++++++++++++ include/linux/microchipphy.h | 3 +++ 5 files changed, 81 insertions(+), 1 deletion(-) create mode 100644 include/dt-bindings/net/microchip-lan78xx.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index a7321687cae9..c952d3076a65 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14572,6 +14572,7 @@ M: Microchip Linux Driver Support L: netdev@vger.kernel.org S: Maintained F: drivers/net/usb/lan78xx.* +F: include/dt-bindings/net/microchip-lan78xx.h USB MASS STORAGE DRIVER M: Alan Stern diff --git a/drivers/net/phy/microchip.c b/drivers/net/phy/microchip.c index 0f293ef28935..ef5e160fe9dc 100644 --- a/drivers/net/phy/microchip.c +++ b/drivers/net/phy/microchip.c @@ -20,6 +20,8 @@ #include #include #include +#include +#include #define DRIVER_AUTHOR "WOOJUNG HUH " #define DRIVER_DESC "Microchip LAN88XX PHY driver" @@ -70,6 +72,8 @@ static int lan88xx_probe(struct phy_device *phydev) { struct device *dev = &phydev->mdio.dev; struct lan88xx_priv *priv; + u32 led_modes[4]; + int len; priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); if (!priv) @@ -77,6 +81,27 @@ static int lan88xx_probe(struct phy_device *phydev) priv->wolopts = 0; + len = of_property_read_variable_u32_array(dev->of_node, + "microchip,led-modes", + led_modes, + 0, + ARRAY_SIZE(led_modes)); + if (len >= 0) { + u32 reg = 0; + int i; + + for (i = 0; i < len; i++) { + if (led_modes[i] > 15) + return -EINVAL; + reg |= led_modes[i] << (i * 4); + } + for (; i < ARRAY_SIZE(led_modes); i++) + reg |= LAN78XX_FORCE_LED_OFF << (i * 4); + (void)phy_write(phydev, LAN78XX_PHY_LED_MODE_SELECT, reg); + } else if (len == -EOVERFLOW) { + return -EINVAL; + } + /* these values can be used to identify internal PHY */ priv->chip_id = phy_read_mmd(phydev, 3, LAN88XX_MMD3_CHIP_ID); priv->chip_rev = phy_read_mmd(phydev, 3, LAN88XX_MMD3_CHIP_REV); diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index a823f010de30..6b03b973083e 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include "lan78xx.h" @@ -1760,6 +1761,7 @@ done: static int lan78xx_mdio_init(struct lan78xx_net *dev) { + struct device_node *node; int ret; dev->mdiobus = mdiobus_alloc(); @@ -1788,7 +1790,13 @@ static int lan78xx_mdio_init(struct lan78xx_net *dev) break; } - ret = mdiobus_register(dev->mdiobus); + node = of_get_child_by_name(dev->udev->dev.of_node, "mdio"); + if (node) { + ret = of_mdiobus_register(dev->mdiobus, node); + of_node_put(node); + } else { + ret = mdiobus_register(dev->mdiobus); + } if (ret) { netdev_err(dev->net, "can't register MDIO bus\n"); goto exit1; @@ -2077,6 +2085,28 @@ static int lan78xx_phy_init(struct lan78xx_net *dev) mii_adv = (u32)mii_advertise_flowctrl(dev->fc_request_control); phydev->advertising |= mii_adv_to_ethtool_adv_t(mii_adv); + if (phydev->mdio.dev.of_node) { + u32 reg; + int len; + + len = of_property_count_elems_of_size(phydev->mdio.dev.of_node, + "microchip,led-modes", + sizeof(u32)); + if (len >= 0) { + /* Ensure the appropriate LEDs are enabled */ + lan78xx_read_reg(dev, HW_CFG, ®); + reg &= ~(HW_CFG_LED0_EN_ | + HW_CFG_LED1_EN_ | + HW_CFG_LED2_EN_ | + HW_CFG_LED3_EN_); + reg |= (len > 0) * HW_CFG_LED0_EN_ | + (len > 1) * HW_CFG_LED1_EN_ | + (len > 2) * HW_CFG_LED2_EN_ | + (len > 3) * HW_CFG_LED3_EN_; + 
lan78xx_write_reg(dev, HW_CFG, reg); + } + } + genphy_config_aneg(phydev); dev->fc_autoneg = phydev->autoneg; diff --git a/include/dt-bindings/net/microchip-lan78xx.h b/include/dt-bindings/net/microchip-lan78xx.h new file mode 100644 index 000000000000..0742ff075307 --- /dev/null +++ b/include/dt-bindings/net/microchip-lan78xx.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _DT_BINDINGS_MICROCHIP_LAN78XX_H +#define _DT_BINDINGS_MICROCHIP_LAN78XX_H + +/* LED modes for LAN7800/LAN7850 embedded PHY */ + +#define LAN78XX_LINK_ACTIVITY 0 +#define LAN78XX_LINK_1000_ACTIVITY 1 +#define LAN78XX_LINK_100_ACTIVITY 2 +#define LAN78XX_LINK_10_ACTIVITY 3 +#define LAN78XX_LINK_100_1000_ACTIVITY 4 +#define LAN78XX_LINK_10_1000_ACTIVITY 5 +#define LAN78XX_LINK_10_100_ACTIVITY 6 +#define LAN78XX_DUPLEX_COLLISION 8 +#define LAN78XX_COLLISION 9 +#define LAN78XX_ACTIVITY 10 +#define LAN78XX_AUTONEG_FAULT 12 +#define LAN78XX_FORCE_LED_OFF 14 +#define LAN78XX_FORCE_LED_ON 15 + +#endif diff --git a/include/linux/microchipphy.h b/include/linux/microchipphy.h index eb492d47f717..8e4015e7f8d3 100644 --- a/include/linux/microchipphy.h +++ b/include/linux/microchipphy.h @@ -70,4 +70,7 @@ #define LAN88XX_MMD3_CHIP_ID (32877) #define LAN88XX_MMD3_CHIP_REV (32878) +/* Registers specific to the LAN7800/LAN7850 embedded phy */ +#define LAN78XX_PHY_LED_MODE_SELECT (0x1D) + #endif /* _MICROCHIPPHY_H */ -- cgit v1.2.3 From 13c935bb09948aef0202574ee12bb089459eb43b Mon Sep 17 00:00:00 2001 From: Salvatore Mesoraca Date: Mon, 9 Apr 2018 15:54:46 +0200 Subject: crypto: api - laying defines and checks for statically allocated buffers In preparation for the removal of VLAs[1] from crypto code. We create 2 new compile-time constants: all ciphers implemented in Linux have a block size less than or equal to 16 bytes and the most demanding hw require 16 bytes alignment for the block buffer. We also enforce these limits in crypto_check_alg when a new cipher is registered. [1] http://lkml.kernel.org/r/CA+55aFzCG-zNmZwX4A2FQpadafLfEzK6CC=qPXydAacU1RqZWA@mail.gmail.com Signed-off-by: Salvatore Mesoraca Signed-off-by: Herbert Xu --- crypto/algapi.c | 10 ++++++++++ include/crypto/algapi.h | 8 ++++++++ 2 files changed, 18 insertions(+) (limited to 'include') diff --git a/crypto/algapi.c b/crypto/algapi.c index 2a0271b5f62a..c0755cf4f53f 100644 --- a/crypto/algapi.c +++ b/crypto/algapi.c @@ -10,6 +10,7 @@ * */ +#include #include #include #include @@ -59,6 +60,15 @@ static int crypto_check_alg(struct crypto_alg *alg) if (alg->cra_blocksize > PAGE_SIZE / 8) return -EINVAL; + if (!alg->cra_type && (alg->cra_flags & CRYPTO_ALG_TYPE_MASK) == + CRYPTO_ALG_TYPE_CIPHER) { + if (alg->cra_alignmask > MAX_CIPHER_ALIGNMASK) + return -EINVAL; + + if (alg->cra_blocksize > MAX_CIPHER_BLOCKSIZE) + return -EINVAL; + } + if (alg->cra_priority < 0) return -EINVAL; diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h index 1aba888241dd..bd5e8ccf1687 100644 --- a/include/crypto/algapi.h +++ b/include/crypto/algapi.h @@ -17,6 +17,14 @@ #include #include +/* + * Maximum values for blocksize and alignmask, used to allocate + * static buffers that are big enough for any combination of + * ciphers and architectures. 
+ */ +#define MAX_CIPHER_BLOCKSIZE 16 +#define MAX_CIPHER_ALIGNMASK 15 + struct crypto_aead; struct crypto_instance; struct module; -- cgit v1.2.3 From 8af70cd2ca508061088d5059ba8a8218aca7ddf1 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Fri, 20 Apr 2018 10:14:27 -0700 Subject: memory: aemif: add support for board files Currently aemif is supported in two places separately. By the platform driver in drivers/memory and by a hand crafted driver in mach-davinci. We want to drop the latter but also keep the legacy mode. Add support for board files to the aemif driver. The new structure in platform data currently only contains the chip select number, since currently existing users don't require anything else, but it can be extended in the future. While extending the platform data struct, add kernel docs describing its members. Signed-off-by: Bartosz Golaszewski Signed-off-by: Santosh Shilimkar --- drivers/memory/ti-aemif.c | 58 ++++++++++++++++++++++------------ include/linux/platform_data/ti-aemif.h | 25 +++++++++++++++ 2 files changed, 63 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/drivers/memory/ti-aemif.c b/drivers/memory/ti-aemif.c index 588e58d40d1b..31112f622b88 100644 --- a/drivers/memory/ti-aemif.c +++ b/drivers/memory/ti-aemif.c @@ -339,9 +339,6 @@ static int aemif_probe(struct platform_device *pdev) struct aemif_platform_data *pdata; struct of_dev_auxdata *dev_lookup; - if (np == NULL) - return 0; - aemif = devm_kzalloc(dev, sizeof(*aemif), GFP_KERNEL); if (!aemif) return -ENOMEM; @@ -363,8 +360,10 @@ static int aemif_probe(struct platform_device *pdev) aemif->clk_rate = clk_get_rate(aemif->clk) / MSEC_PER_SEC; - if (of_device_is_compatible(np, "ti,da850-aemif")) + if (np && of_device_is_compatible(np, "ti,da850-aemif")) aemif->cs_offset = 2; + else if (pdata) + aemif->cs_offset = pdata->cs_offset; res = platform_get_resource(pdev, IORESOURCE_MEM, 0); aemif->base = devm_ioremap_resource(dev, res); @@ -373,15 +372,23 @@ static int aemif_probe(struct platform_device *pdev) goto error; } - /* - * For every controller device node, there is a cs device node that - * describe the bus configuration parameters. This functions iterate - * over these nodes and update the cs data array. - */ - for_each_available_child_of_node(np, child_np) { - ret = of_aemif_parse_abus_config(pdev, child_np); - if (ret < 0) - goto error; + if (np) { + /* + * For every controller device node, there is a cs device node + * that describe the bus configuration parameters. This + * functions iterate over these nodes and update the cs data + * array. + */ + for_each_available_child_of_node(np, child_np) { + ret = of_aemif_parse_abus_config(pdev, child_np); + if (ret < 0) + goto error; + } + } else if (pdata && pdata->num_abus_data > 0) { + for (i = 0; i < pdata->num_abus_data; i++, aemif->num_cs++) { + aemif->cs_data[i].cs = pdata->abus_data[i].cs; + aemif_get_hw_params(pdev, i); + } } for (i = 0; i < aemif->num_cs; i++) { @@ -394,14 +401,25 @@ static int aemif_probe(struct platform_device *pdev) } /* - * Create a child devices explicitly from here to - * guarantee that the child will be probed after the AEMIF timing - * parameters are set. + * Create a child devices explicitly from here to guarantee that the + * child will be probed after the AEMIF timing parameters are set. 
*/ - for_each_available_child_of_node(np, child_np) { - ret = of_platform_populate(child_np, NULL, dev_lookup, dev); - if (ret < 0) - goto error; + if (np) { + for_each_available_child_of_node(np, child_np) { + ret = of_platform_populate(child_np, NULL, + dev_lookup, dev); + if (ret < 0) + goto error; + } + } else { + for (i = 0; i < pdata->num_sub_devices; i++) { + pdata->sub_devices[i].dev.parent = dev; + ret = platform_device_register(&pdata->sub_devices[i]); + if (ret) { + dev_warn(dev, "Error register sub device %s\n", + pdata->sub_devices[i].name); + } + } } return 0; diff --git a/include/linux/platform_data/ti-aemif.h b/include/linux/platform_data/ti-aemif.h index ac72e115093c..e6407bafcbf8 100644 --- a/include/linux/platform_data/ti-aemif.h +++ b/include/linux/platform_data/ti-aemif.h @@ -16,8 +16,33 @@ #include +/** + * struct aemif_abus_data - Async bus configuration parameters. + * + * @cs - Chip-select number. + */ +struct aemif_abus_data { + u32 cs; +}; + +/** + * struct aemif_platform_data - Data to set up the TI aemif driver. + * + * @dev_lookup: of_dev_auxdata passed to of_platform_populate() for aemif + * subdevices. + * @cs_offset: Lowest allowed chip-select number. + * @abus_data: Array of async bus configuration entries. + * @num_abus_data: Number of abus entries. + * @sub_devices: Array of platform subdevices. + * @num_sub_devices: Number of subdevices. + */ struct aemif_platform_data { struct of_dev_auxdata *dev_lookup; + u32 cs_offset; + struct aemif_abus_data *abus_data; + size_t num_abus_data; + struct platform_device *sub_devices; + size_t num_sub_devices; }; #endif /* __TI_DAVINCI_AEMIF_DATA_H__ */ -- cgit v1.2.3 From dbef91ec5482239055dd2db8ec656fc13d382add Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Wed, 18 Apr 2018 01:35:06 +0200 Subject: scsi: ilog2: create truly constant version for sparse Sparse emits errors about ilog2() in array indices because of the use of __ilog2_32() and __ilog2_64(), rightly so (https://www.spinics.net/lists/linux-sparse/msg03471.html). Create a const_ilog2() variant that works with sparse for this scenario. (Note: checkpatch.pl complains about missing parentheses, but that appears to be a false positive. I can get rid of the warning simply by inserting whitespace, making checkpatch "see" the whole macro). Signed-off-by: Martin Wilck Signed-off-by: Martin K. Petersen --- include/linux/log2.h | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/log2.h b/include/linux/log2.h index 41a1ae010993..2af7f77866d0 100644 --- a/include/linux/log2.h +++ b/include/linux/log2.h @@ -72,16 +72,13 @@ unsigned long __rounddown_pow_of_two(unsigned long n) } /** - * ilog2 - log base 2 of 32-bit or a 64-bit unsigned value + * const_ilog2 - log base 2 of 32-bit or a 64-bit constant unsigned value * @n: parameter * - * constant-capable log of base 2 calculation - * - this can be used to initialise global variables from constant data, hence - * the massive ternary operator construction - * - * selects the appropriately-sized optimised version depending on sizeof(n) + * Use this where sparse expects a true constant expression, e.g. for array + * indices. */ -#define ilog2(n) \ +#define const_ilog2(n) \ ( \ __builtin_constant_p(n) ? ( \ (n) < 2 ? 0 : \ @@ -147,10 +144,26 @@ unsigned long __rounddown_pow_of_two(unsigned long n) (n) & (1ULL << 4) ? 4 : \ (n) & (1ULL << 3) ? 3 : \ (n) & (1ULL << 2) ? 2 : \ - 1 ) : \ - (sizeof(n) <= 4) ? 
\ - __ilog2_u32(n) : \ - __ilog2_u64(n) \ + 1) : \ + -1) + +/** + * ilog2 - log base 2 of 32-bit or a 64-bit unsigned value + * @n: parameter + * + * constant-capable log of base 2 calculation + * - this can be used to initialise global variables from constant data, hence + * the massive ternary operator construction + * + * selects the appropriately-sized optimised version depending on sizeof(n) + */ +#define ilog2(n) \ +( \ + __builtin_constant_p(n) ? \ + const_ilog2(n) : \ + (sizeof(n) <= 4) ? \ + __ilog2_u32(n) : \ + __ilog2_u64(n) \ ) /** -- cgit v1.2.3 From 1409880357ed33dc1c23eed080d88ea4410ed9a3 Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Wed, 18 Apr 2018 01:35:08 +0200 Subject: scsi: devinfo: change blist_flag_t to 64bit Space for SCSI blist flags is gradually running out. Change the type to __u64 and fix a checkpatch complaint about symbolic mode flags in scsi_devinfo.c. Make checkpatch happy by replacing simple_strtoul() with kstrtoull(). Signed-off-by: Martin Wilck Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_devinfo.c | 18 +++++++++++----- drivers/scsi/scsi_sysfs.c | 2 +- include/scsi/scsi_device.h | 2 +- include/scsi/scsi_devinfo.h | 50 ++++++++++++++++++++++----------------------- 4 files changed, 40 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/drivers/scsi/scsi_devinfo.c b/drivers/scsi/scsi_devinfo.c index e5c82f13b7d9..bd04a33e4360 100644 --- a/drivers/scsi/scsi_devinfo.c +++ b/drivers/scsi/scsi_devinfo.c @@ -360,8 +360,16 @@ int scsi_dev_info_list_add_keyed(int compatible, char *vendor, char *model, scsi_strcpy_devinfo("model", devinfo->model, sizeof(devinfo->model), model, compatible); - if (strflags) - flags = (__force blist_flags_t)simple_strtoul(strflags, NULL, 0); + if (strflags) { + unsigned long long val; + int ret = kstrtoull(strflags, 0, &val); + + if (ret != 0) { + kfree(devinfo); + return ret; + } + flags = (__force blist_flags_t)val; + } devinfo->flags = flags; devinfo->compatible = compatible; @@ -614,7 +622,7 @@ static int devinfo_seq_show(struct seq_file *m, void *v) devinfo_table->name) seq_printf(m, "[%s]:\n", devinfo_table->name); - seq_printf(m, "'%.8s' '%.16s' 0x%x\n", + seq_printf(m, "'%.8s' '%.16s' 0x%llx\n", devinfo->vendor, devinfo->model, devinfo->flags); return 0; } @@ -733,9 +741,9 @@ MODULE_PARM_DESC(dev_flags, " list entries for vendor and model with an integer value of flags" " to the scsi device info list"); -module_param_named(default_dev_flags, scsi_default_dev_flags, int, S_IRUGO|S_IWUSR); +module_param_named(default_dev_flags, scsi_default_dev_flags, ullong, 0644); MODULE_PARM_DESC(default_dev_flags, - "scsi default device flag integer value"); + "scsi default device flag uint64_t value"); /** * scsi_exit_devinfo - remove /proc/scsi/device_info & the scsi_dev_info_list diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c index 5120e613a29c..7943b762c12d 100644 --- a/drivers/scsi/scsi_sysfs.c +++ b/drivers/scsi/scsi_sysfs.c @@ -968,7 +968,7 @@ sdev_show_wwid(struct device *dev, struct device_attribute *attr, static DEVICE_ATTR(wwid, S_IRUGO, sdev_show_wwid, NULL); #define BLIST_FLAG_NAME(name) \ - [const_ilog2((__force unsigned int)BLIST_##name)] = #name + [const_ilog2((__force __u64)BLIST_##name)] = #name static const char *const sdev_bflags_name[] = { #include "scsi_devinfo_tbl.c" }; diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index 7ae177c8e399..4c36af6edd79 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -15,7 +15,7 @@ struct 
scsi_cmnd; struct scsi_lun; struct scsi_sense_hdr; -typedef unsigned int __bitwise blist_flags_t; +typedef __u64 __bitwise blist_flags_t; struct scsi_mode_data { __u32 length; diff --git a/include/scsi/scsi_devinfo.h b/include/scsi/scsi_devinfo.h index ea67c32e870e..e206d299f137 100644 --- a/include/scsi/scsi_devinfo.h +++ b/include/scsi/scsi_devinfo.h @@ -6,55 +6,55 @@ */ /* Only scan LUN 0 */ -#define BLIST_NOLUN ((__force blist_flags_t)(1 << 0)) +#define BLIST_NOLUN ((__force blist_flags_t)(1ULL << 0)) /* Known to have LUNs, force scanning. * DEPRECATED: Use max_luns=N */ -#define BLIST_FORCELUN ((__force blist_flags_t)(1 << 1)) +#define BLIST_FORCELUN ((__force blist_flags_t)(1ULL << 1)) /* Flag for broken handshaking */ -#define BLIST_BORKEN ((__force blist_flags_t)(1 << 2)) +#define BLIST_BORKEN ((__force blist_flags_t)(1ULL << 2)) /* unlock by special command */ -#define BLIST_KEY ((__force blist_flags_t)(1 << 3)) +#define BLIST_KEY ((__force blist_flags_t)(1ULL << 3)) /* Do not use LUNs in parallel */ -#define BLIST_SINGLELUN ((__force blist_flags_t)(1 << 4)) +#define BLIST_SINGLELUN ((__force blist_flags_t)(1ULL << 4)) /* Buggy Tagged Command Queuing */ -#define BLIST_NOTQ ((__force blist_flags_t)(1 << 5)) +#define BLIST_NOTQ ((__force blist_flags_t)(1ULL << 5)) /* Non consecutive LUN numbering */ -#define BLIST_SPARSELUN ((__force blist_flags_t)(1 << 6)) +#define BLIST_SPARSELUN ((__force blist_flags_t)(1ULL << 6)) /* Avoid LUNS >= 5 */ -#define BLIST_MAX5LUN ((__force blist_flags_t)(1 << 7)) +#define BLIST_MAX5LUN ((__force blist_flags_t)(1ULL << 7)) /* Treat as (removable) CD-ROM */ -#define BLIST_ISROM ((__force blist_flags_t)(1 << 8)) +#define BLIST_ISROM ((__force blist_flags_t)(1ULL << 8)) /* LUNs past 7 on a SCSI-2 device */ -#define BLIST_LARGELUN ((__force blist_flags_t)(1 << 9)) +#define BLIST_LARGELUN ((__force blist_flags_t)(1ULL << 9)) /* override additional length field */ -#define BLIST_INQUIRY_36 ((__force blist_flags_t)(1 << 10)) +#define BLIST_INQUIRY_36 ((__force blist_flags_t)(1ULL << 10)) /* do not do automatic start on add */ -#define BLIST_NOSTARTONADD ((__force blist_flags_t)(1 << 12)) +#define BLIST_NOSTARTONADD ((__force blist_flags_t)(1ULL << 12)) /* try REPORT_LUNS even for SCSI-2 devs (if HBA supports more than 8 LUNs) */ -#define BLIST_REPORTLUN2 ((__force blist_flags_t)(1 << 17)) +#define BLIST_REPORTLUN2 ((__force blist_flags_t)(1ULL << 17)) /* don't try REPORT_LUNS scan (SCSI-3 devs) */ -#define BLIST_NOREPORTLUN ((__force blist_flags_t)(1 << 18)) +#define BLIST_NOREPORTLUN ((__force blist_flags_t)(1ULL << 18)) /* don't use PREVENT-ALLOW commands */ -#define BLIST_NOT_LOCKABLE ((__force blist_flags_t)(1 << 19)) +#define BLIST_NOT_LOCKABLE ((__force blist_flags_t)(1ULL << 19)) /* device is actually for RAID config */ -#define BLIST_NO_ULD_ATTACH ((__force blist_flags_t)(1 << 20)) +#define BLIST_NO_ULD_ATTACH ((__force blist_flags_t)(1ULL << 20)) /* select without ATN */ -#define BLIST_SELECT_NO_ATN ((__force blist_flags_t)(1 << 21)) +#define BLIST_SELECT_NO_ATN ((__force blist_flags_t)(1ULL << 21)) /* retry HARDWARE_ERROR */ -#define BLIST_RETRY_HWERROR ((__force blist_flags_t)(1 << 22)) +#define BLIST_RETRY_HWERROR ((__force blist_flags_t)(1ULL << 22)) /* maximum 512 sector cdb length */ -#define BLIST_MAX_512 ((__force blist_flags_t)(1 << 23)) +#define BLIST_MAX_512 ((__force blist_flags_t)(1ULL << 23)) /* Disable T10 PI (DIF) */ -#define BLIST_NO_DIF ((__force blist_flags_t)(1 << 25)) +#define BLIST_NO_DIF ((__force blist_flags_t)(1ULL << 25)) 
/* Ignore SBC-3 VPD pages */ -#define BLIST_SKIP_VPD_PAGES ((__force blist_flags_t)(1 << 26)) +#define BLIST_SKIP_VPD_PAGES ((__force blist_flags_t)(1ULL << 26)) /* Attempt to read VPD pages */ -#define BLIST_TRY_VPD_PAGES ((__force blist_flags_t)(1 << 28)) +#define BLIST_TRY_VPD_PAGES ((__force blist_flags_t)(1ULL << 28)) /* don't try to issue RSOC */ -#define BLIST_NO_RSOC ((__force blist_flags_t)(1 << 29)) +#define BLIST_NO_RSOC ((__force blist_flags_t)(1ULL << 29)) /* maximum 1024 sector cdb length */ -#define BLIST_MAX_1024 ((__force blist_flags_t)(1 << 30)) +#define BLIST_MAX_1024 ((__force blist_flags_t)(1ULL << 30)) /* Use UNMAP limit for WRITE SAME */ -#define BLIST_UNMAP_LIMIT_WS ((__force blist_flags_t)(1 << 31)) +#define BLIST_UNMAP_LIMIT_WS ((__force blist_flags_t)(1ULL << 31)) #endif -- cgit v1.2.3 From 358fda5ff4c441dfa6e66fd3b1f7d8f882ecba8f Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Wed, 18 Apr 2018 01:35:09 +0200 Subject: scsi: devinfo: warn on undefined blist flags Warn if a device (or the user) sets blist flags which are unknown or have been removed. This should enable us to reuse freed blist bits in later releases. Signed-off-by: Martin Wilck Signed-off-by: Martin K. Petersen --- drivers/scsi/Makefile | 2 +- drivers/scsi/scsi_devinfo.c | 6 ++++++ include/scsi/scsi_devinfo.h | 21 +++++++++++++++++++++ 3 files changed, 28 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/scsi/Makefile b/drivers/scsi/Makefile index f31145d6d472..eb30e558fc36 100644 --- a/drivers/scsi/Makefile +++ b/drivers/scsi/Makefile @@ -190,7 +190,7 @@ $(obj)/53c700.o $(MODVERDIR)/$(obj)/53c700.ver: $(obj)/53c700_d.h $(obj)/scsi_sysfs.o: $(obj)/scsi_devinfo_tbl.c quiet_cmd_bflags = GEN $@ - cmd_bflags = sed -n 's/.*BLIST_\([A-Z0-9_]*\) *.*/BLIST_FLAG_NAME(\1),/p' $< > $@ + cmd_bflags = sed -n 's/.*define *BLIST_\([A-Z0-9_]*\) *.*/BLIST_FLAG_NAME(\1),/p' $< > $@ $(obj)/scsi_devinfo_tbl.c: include/scsi/scsi_devinfo.h $(call if_changed,bflags) diff --git a/drivers/scsi/scsi_devinfo.c b/drivers/scsi/scsi_devinfo.c index bd04a33e4360..9603f3ef18aa 100644 --- a/drivers/scsi/scsi_devinfo.c +++ b/drivers/scsi/scsi_devinfo.c @@ -370,6 +370,12 @@ int scsi_dev_info_list_add_keyed(int compatible, char *vendor, char *model, } flags = (__force blist_flags_t)val; } + if (flags & __BLIST_UNUSED_MASK) { + pr_err("scsi_devinfo (%s:%s): unsupported flags 0x%llx", + vendor, model, flags & __BLIST_UNUSED_MASK); + kfree(devinfo); + return -EINVAL; + } devinfo->flags = flags; devinfo->compatible = compatible; diff --git a/include/scsi/scsi_devinfo.h b/include/scsi/scsi_devinfo.h index e206d299f137..3434e143feff 100644 --- a/include/scsi/scsi_devinfo.h +++ b/include/scsi/scsi_devinfo.h @@ -28,8 +28,13 @@ #define BLIST_LARGELUN ((__force blist_flags_t)(1ULL << 9)) /* override additional length field */ #define BLIST_INQUIRY_36 ((__force blist_flags_t)(1ULL << 10)) +#define __BLIST_UNUSED_11 ((__force blist_flags_t)(1ULL << 11)) /* do not do automatic start on add */ #define BLIST_NOSTARTONADD ((__force blist_flags_t)(1ULL << 12)) +#define __BLIST_UNUSED_13 ((__force blist_flags_t)(1ULL << 13)) +#define __BLIST_UNUSED_14 ((__force blist_flags_t)(1ULL << 14)) +#define __BLIST_UNUSED_15 ((__force blist_flags_t)(1ULL << 15)) +#define __BLIST_UNUSED_16 ((__force blist_flags_t)(1ULL << 16)) /* try REPORT_LUNS even for SCSI-2 devs (if HBA supports more than 8 LUNs) */ #define BLIST_REPORTLUN2 ((__force blist_flags_t)(1ULL << 17)) /* don't try REPORT_LUNS scan (SCSI-3 devs) */ @@ -44,10 +49,12 @@ #define 
BLIST_RETRY_HWERROR ((__force blist_flags_t)(1ULL << 22)) /* maximum 512 sector cdb length */ #define BLIST_MAX_512 ((__force blist_flags_t)(1ULL << 23)) +#define __BLIST_UNUSED_24 ((__force blist_flags_t)(1ULL << 24)) /* Disable T10 PI (DIF) */ #define BLIST_NO_DIF ((__force blist_flags_t)(1ULL << 25)) /* Ignore SBC-3 VPD pages */ #define BLIST_SKIP_VPD_PAGES ((__force blist_flags_t)(1ULL << 26)) +#define __BLIST_UNUSED_27 ((__force blist_flags_t)(1ULL << 27)) /* Attempt to read VPD pages */ #define BLIST_TRY_VPD_PAGES ((__force blist_flags_t)(1ULL << 28)) /* don't try to issue RSOC */ @@ -57,4 +64,18 @@ /* Use UNMAP limit for WRITE SAME */ #define BLIST_UNMAP_LIMIT_WS ((__force blist_flags_t)(1ULL << 31)) +#define __BLIST_LAST_USED BLIST_UNMAP_LIMIT_WS + +#define __BLIST_HIGH_UNUSED (~(__BLIST_LAST_USED | \ + (__force blist_flags_t) \ + ((__force __u64)__BLIST_LAST_USED - 1ULL))) +#define __BLIST_UNUSED_MASK (__BLIST_UNUSED_11 | \ + __BLIST_UNUSED_13 | \ + __BLIST_UNUSED_14 | \ + __BLIST_UNUSED_15 | \ + __BLIST_UNUSED_16 | \ + __BLIST_UNUSED_24 | \ + __BLIST_UNUSED_27 | \ + __BLIST_HIGH_UNUSED) + #endif -- cgit v1.2.3 From 29cfc2ab71d9642c2f4fda6cd278309cc253ff82 Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Wed, 18 Apr 2018 01:35:10 +0200 Subject: scsi: devinfo: add BLIST_RETRY_ITF for EMC Symmetrix EMC Symmetrix returns 'internal target error' for a variety of conditions, most of which will be transient. So we should always retry it, even with failfast set. Otherwise we'd get spurious path flaps with multipath. Signed-off-by: Martin Wilck Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_devinfo.c | 3 ++- drivers/scsi/scsi_error.c | 4 ++++ include/scsi/scsi_devinfo.h | 4 +++- 3 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/scsi/scsi_devinfo.c b/drivers/scsi/scsi_devinfo.c index 9603f3ef18aa..87bc50686ac6 100644 --- a/drivers/scsi/scsi_devinfo.c +++ b/drivers/scsi/scsi_devinfo.c @@ -161,7 +161,8 @@ static struct { {"DGC", "RAID", NULL, BLIST_SPARSELUN}, /* EMC CLARiiON, storage on LUN 0 */ {"DGC", "DISK", NULL, BLIST_SPARSELUN}, /* EMC CLARiiON, no storage on LUN 0 */ {"EMC", "Invista", "*", BLIST_SPARSELUN | BLIST_LARGELUN}, - {"EMC", "SYMMETRIX", NULL, BLIST_SPARSELUN | BLIST_LARGELUN | BLIST_REPORTLUN2}, + {"EMC", "SYMMETRIX", NULL, BLIST_SPARSELUN | BLIST_LARGELUN | + BLIST_REPORTLUN2 | BLIST_RETRY_ITF}, {"EMULEX", "MD21/S2 ESDI", NULL, BLIST_SINGLELUN}, {"easyRAID", "16P", NULL, BLIST_NOREPORTLUN}, {"easyRAID", "X6P", NULL, BLIST_NOREPORTLUN}, diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index 946039117bf4..633b198ed895 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include "scsi_priv.h" @@ -525,6 +526,9 @@ int scsi_check_sense(struct scsi_cmnd *scmd) if (sshdr.asc == 0x10) /* DIF */ return SUCCESS; + if (sshdr.asc == 0x44 && sdev->sdev_bflags & BLIST_RETRY_ITF) + return ADD_TO_MLQUEUE; + return NEEDS_RETRY; case NOT_READY: case UNIT_ATTENTION: diff --git a/include/scsi/scsi_devinfo.h b/include/scsi/scsi_devinfo.h index 3434e143feff..91a327edf1fa 100644 --- a/include/scsi/scsi_devinfo.h +++ b/include/scsi/scsi_devinfo.h @@ -63,8 +63,10 @@ #define BLIST_MAX_1024 ((__force blist_flags_t)(1ULL << 30)) /* Use UNMAP limit for WRITE SAME */ #define BLIST_UNMAP_LIMIT_WS ((__force blist_flags_t)(1ULL << 31)) +/* Always retry ABORTED_COMMAND with Internal Target Failure */ +#define BLIST_RETRY_ITF ((__force blist_flags_t)(1ULL 
<< 32)) -#define __BLIST_LAST_USED BLIST_UNMAP_LIMIT_WS +#define __BLIST_LAST_USED BLIST_RETRY_ITF #define __BLIST_HIGH_UNUSED (~(__BLIST_LAST_USED | \ (__force blist_flags_t) \ -- cgit v1.2.3 From c360652006bba40837cf16d5099ea61f7ce16c63 Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Wed, 18 Apr 2018 01:35:11 +0200 Subject: scsi: devinfo: BLIST_RETRY_ASC_C1 for Fujitsu ETERNUS On Fujitsu ETERNUS systems, sense code ABORTED COMMAND with ASC/Q C1/01 is used to indicate a temporary condition where the storage-internal path to a target is switched from one controller to another. SCSI commands that return with this error code must be retried unconditionally (i.e. without the "maybe_retry" logic in scsi_decide_disposition); otherwise dm-multipath might initiate a failover from a healthy path e.g. for REQ_FAILFAST_DEV commands. Introduce a new blist flag for this case. [mkp: applied by hand] Signed-off-by: Martin Wilck Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_devinfo.c | 1 + drivers/scsi/scsi_error.c | 3 +++ include/scsi/scsi_devinfo.h | 4 +++- 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/scsi/scsi_devinfo.c b/drivers/scsi/scsi_devinfo.c index 87bc50686ac6..c4cbfd07b916 100644 --- a/drivers/scsi/scsi_devinfo.c +++ b/drivers/scsi/scsi_devinfo.c @@ -168,6 +168,7 @@ static struct { {"easyRAID", "X6P", NULL, BLIST_NOREPORTLUN}, {"easyRAID", "F8", NULL, BLIST_NOREPORTLUN}, {"FSC", "CentricStor", "*", BLIST_SPARSELUN | BLIST_LARGELUN}, + {"FUJITSU", "ETERNUS_DXM", "*", BLIST_RETRY_ASC_C1}, {"Generic", "USB SD Reader", "1.00", BLIST_FORCELUN | BLIST_INQUIRY_36}, {"Generic", "USB Storage-SMC", NULL, BLIST_FORCELUN | BLIST_INQUIRY_36}, /* FW: 0180 and 0207 */ {"HITACHI", "DF400", "*", BLIST_REPORTLUN2}, diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index 633b198ed895..94d2047e0096 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -528,6 +528,9 @@ int scsi_check_sense(struct scsi_cmnd *scmd) if (sshdr.asc == 0x44 && sdev->sdev_bflags & BLIST_RETRY_ITF) return ADD_TO_MLQUEUE; + if (sshdr.asc == 0xc1 && sshdr.ascq == 0x01 && + sdev->sdev_bflags & BLIST_RETRY_ASC_C1) + return ADD_TO_MLQUEUE; return NEEDS_RETRY; case NOT_READY: diff --git a/include/scsi/scsi_devinfo.h b/include/scsi/scsi_devinfo.h index 91a327edf1fa..3fdb322d4c4b 100644 --- a/include/scsi/scsi_devinfo.h +++ b/include/scsi/scsi_devinfo.h @@ -65,8 +65,10 @@ #define BLIST_UNMAP_LIMIT_WS ((__force blist_flags_t)(1ULL << 31)) /* Always retry ABORTED_COMMAND with Internal Target Failure */ #define BLIST_RETRY_ITF ((__force blist_flags_t)(1ULL << 32)) +/* Always retry ABORTED_COMMAND with ASC 0xc1 */ +#define BLIST_RETRY_ASC_C1 ((__force blist_flags_t)(1ULL << 33)) -#define __BLIST_LAST_USED BLIST_RETRY_ITF +#define __BLIST_LAST_USED BLIST_RETRY_ASC_C1 #define __BLIST_HIGH_UNUSED (~(__BLIST_LAST_USED | \ (__force blist_flags_t) \ -- cgit v1.2.3 From 572ccdab50bb3ae9096d6947c2e78a7107acf2dd Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 14 Apr 2018 10:51:05 -0700 Subject: scsi: target: target_core_user.[ch]: convert comments into DOC: Make the documentation of the target-supported userspace-I/O design usable by kernel-doc by using "DOC:". This is used in the driver-api Documentation chapter. Signed-off-by: Randy Dunlap To: "Nicholas A. Bellinger" Cc: linux-scsi@vger.kernel.org Cc: target-devel@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: "James E.J. Bottomley" Cc: "Martin K. Petersen" Cc: Jonathan Corbet Signed-off-by: Martin K.
Petersen --- drivers/target/target_core_user.c | 8 ++++++-- include/uapi/linux/target_core_user.h | 11 ++++++----- 2 files changed, 12 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index d6af29250d94..ae0aea9a3aad 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -42,7 +42,11 @@ #include -/* +/** + * DOC: Userspace I/O + * Userspace I/O + * ------------- + * * Define a shared-memory interface for LIO to pass SCSI commands and * data to userspace for processing. This is to allow backends that * are too complex for in-kernel support to be possible. @@ -53,7 +57,7 @@ * See the .h file for how the ring is laid out. Note that while the * command ring is defined, the particulars of the data area are * not. Offset values in the command entry point to other locations - * internal to the mmap()ed area. There is separate space outside the + * internal to the mmap-ed area. There is separate space outside the * command ring for data buffers. This leaves maximum flexibility for * moving buffer allocations, or even page flipping or other * allocation techniques, without altering the command ring layout. diff --git a/include/uapi/linux/target_core_user.h b/include/uapi/linux/target_core_user.h index 0be80f72646b..6e299349b158 100644 --- a/include/uapi/linux/target_core_user.h +++ b/include/uapi/linux/target_core_user.h @@ -9,21 +9,22 @@ #define TCMU_VERSION "2.0" -/* +/** + * DOC: Ring Design * Ring Design * ----------- * * The mmaped area is divided into three parts: - * 1) The mailbox (struct tcmu_mailbox, below) - * 2) The command ring - * 3) Everything beyond the command ring (data) + * 1) The mailbox (struct tcmu_mailbox, below); + * 2) The command ring; + * 3) Everything beyond the command ring (data). * * The mailbox tells userspace the offset of the command ring from the * start of the shared memory region, and how big the command ring is. * * The kernel passes SCSI commands to userspace by putting a struct * tcmu_cmd_entry in the ring, updating mailbox->cmd_head, and poking - * userspace via uio's interrupt mechanism. + * userspace via UIO's interrupt mechanism. * * tcmu_cmd_entry contains a header. If the header type is PAD, * userspace should skip hdr->length bytes (mod cmdr_size) to find the -- cgit v1.2.3 From f134fbbb4ff813dd227c9ce40b5c0b2078a77b07 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Sat, 21 Apr 2018 08:54:40 +1000 Subject: mtd: spi-nor: clear Winbond Extended Address Reg on switch to 3-byte addressing. Winbond spi-nor flash 32MB and larger have an 'Extended Address Register' as one option for addressing beyond 16MB (Macronix has the same concept, Spansion has EXTADD bits in the Bank Address Register). According to section 8.2.7 Write Extended Address Register (C5h) of the Winbond W25Q256FV data sheet (256M-BIT SPI flash): The Extended Address Register is only effective when the device is in the 3-Byte Address Mode. When the device operates in the 4-Byte Address Mode (ADS=1), any command with address input of A31-A24 will replace the Extended Address Register values. It is recommended to check and update the Extended Address Register if necessary when the device is switched from 4-Byte to 3-Byte Address Mode. So the documentation suggests clearing the EAR after switching to 3-byte mode.
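As an illustrative sketch (not from the patch or the datasheet, with simplified types), this is the addressing behaviour that makes a stale EAR harmful in 3-byte mode:

#include <stdint.h>

/*
 * Illustration only: in 3-byte mode a >16MB part supplies the address
 * bits A31-A24 from the Extended Address Register, so the effective
 * address is (EAR << 24) | addr. With EAR == 1, a read of offset
 * 0x000000 silently hits 0x1000000, i.e. the second 16MB.
 */
static uint32_t ear_effective_addr(uint8_t ear, uint32_t addr3b)
{
	return ((uint32_t)ear << 24) | (addr3b & 0x00ffffffu);
}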
Experimentation shows that the EAR is *always* one after the switch to 3-byte mode, so clearing the EAR is mandatory at shutdown for a subsequent 3-byte-addressed reboot to work. Note that some SOCs (e.g. MT7621) do not assert a reset line at normal reboot, so we cannot rely on hardware reset. The MT7621 does assert a reset line at watchdog-reset. Acked-by: Marek Vasut Signed-off-by: NeilBrown Signed-off-by: Boris Brezillon --- drivers/mtd/spi-nor/spi-nor.c | 14 ++++++++++++++ include/linux/mtd/spi-nor.h | 2 ++ 2 files changed, 16 insertions(+) (limited to 'include') diff --git a/drivers/mtd/spi-nor/spi-nor.c b/drivers/mtd/spi-nor/spi-nor.c index 9363f299e4ee..494b7a269872 100644 --- a/drivers/mtd/spi-nor/spi-nor.c +++ b/drivers/mtd/spi-nor/spi-nor.c @@ -284,6 +284,20 @@ static inline int set_4byte(struct spi_nor *nor, const struct flash_info *info, if (need_wren) write_disable(nor); + if (!status && !enable && + JEDEC_MFR(info) == SNOR_MFR_WINBOND) { + /* + * On Winbond W25Q256FV, leaving 4byte mode causes + * the Extended Address Register to be set to 1, so all + * 3-byte-address reads come from the second 16M. + * We must clear the register to enable normal behavior. + */ + write_enable(nor); + nor->cmd_buf[0] = 0; + nor->write_reg(nor, SPINOR_OP_WREAR, nor->cmd_buf, 1); + write_disable(nor); + } + return status; default: /* Spansion style */ diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index de36969eb359..e60da0d34cc1 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -62,6 +62,8 @@ #define SPINOR_OP_RDCR 0x35 /* Read configuration register */ #define SPINOR_OP_RDFSR 0x70 /* Read flag status register */ #define SPINOR_OP_CLFSR 0x50 /* Clear flag status register */ +#define SPINOR_OP_RDEAR 0xc8 /* Read Extended Address Register */ +#define SPINOR_OP_WREAR 0xc5 /* Write Extended Address Register */ /* 4-byte address opcodes - used on Spansion and some Macronix flashes. */ #define SPINOR_OP_READ_4B 0x13 /* Read data bytes (low frequency) */ -- cgit v1.2.3 From 07cb9623ee7d15cc9968969ac247edae6972fb8f Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 26 Feb 2018 10:15:10 +0100 Subject: ipv6: make ip6_dst_mtu_forward inline Just like ip_dst_mtu_maybe_forward(), to avoid a dependency with ipv6.ko. 
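A minimal sketch of the pattern being applied here (hypothetical names, not the kernel API): a function exported from a module forces every caller to link against that module, while a static inline in a shared header is compiled into each caller and creates no module dependency:

/* shared header, illustrative only */
struct demo_dst { unsigned int mtu; };

static inline unsigned int demo_dst_mtu(const struct demo_dst *d)
{
	/*
	 * The body is duplicated into each translation unit that calls
	 * it, so no symbol has to be exported from any module.
	 */
	return d->mtu ? d->mtu : 1280;	/* 1280 == IPV6_MIN_MTU */
}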
Signed-off-by: Felix Fietkau Signed-off-by: Pablo Neira Ayuso --- include/net/ip6_route.h | 21 +++++++++++++++++++++ include/net/ipv6.h | 2 -- net/ipv6/ip6_output.c | 22 ---------------------- 3 files changed, 21 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index d5fb1e4ae7ac..376928c26d2d 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -279,6 +279,27 @@ static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info * !lwtunnel_cmp_encap(a->fib6_nh.nh_lwtstate, b->fib6_nh.nh_lwtstate); } +static inline unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) +{ + struct inet6_dev *idev; + unsigned int mtu; + + if (dst_metric_locked(dst, RTAX_MTU)) { + mtu = dst_metric_raw(dst, RTAX_MTU); + if (mtu) + return mtu; + } + + mtu = IPV6_MIN_MTU; + rcu_read_lock(); + idev = __in6_dev_get(dst->dev); + if (idev) + mtu = idev->cnf.mtu6; + rcu_read_unlock(); + + return mtu; +} + struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, struct net_device *dev, struct sk_buff *skb, const void *daddr); diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 68b167d98879..765441867cfa 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -958,8 +958,6 @@ static inline struct sk_buff *ip6_finish_skb(struct sock *sk) &inet6_sk(sk)->cork); } -unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst); - int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6); struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6, diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 3db47986ef38..cec49e137dbb 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -383,28 +383,6 @@ static inline int ip6_forward_finish(struct net *net, struct sock *sk, return dst_output(net, sk, skb); } -unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) -{ - unsigned int mtu; - struct inet6_dev *idev; - - if (dst_metric_locked(dst, RTAX_MTU)) { - mtu = dst_metric_raw(dst, RTAX_MTU); - if (mtu) - return mtu; - } - - mtu = IPV6_MIN_MTU; - rcu_read_lock(); - idev = __in6_dev_get(dst->dev); - if (idev) - mtu = idev->cnf.mtu6; - rcu_read_unlock(); - - return mtu; -} -EXPORT_SYMBOL_GPL(ip6_dst_mtu_forward); - static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) { if (skb->len <= mtu) -- cgit v1.2.3 From 4f3780c004ef608477a534558eb16f46c5a1c534 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 26 Feb 2018 10:15:11 +0100 Subject: netfilter: nf_flow_table: cache mtu in struct flow_offload_tuple Reduces the number of cache lines touched in the offload forwarding path. This is safe because PMTU limits are bypassed for the forwarding path (see commit f87c10a8aa1e for more details). 
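Roughly, and simplified from the hook code changed below, the per-packet MTU check goes from a metrics walk through the cached dst_entry to a single load from a structure the flow lookup has already touched:

	/* before: derive the MTU from the route on every packet */
	mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);

	/* after: read the value cached in the tuple at flow-setup time */
	mtu = flow->tuplehash[dir].tuple.mtu;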
Signed-off-by: Felix Fietkau Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 2 ++ net/ipv4/netfilter/nf_flow_table_ipv4.c | 17 +++-------------- net/ipv6/netfilter/nf_flow_table_ipv6.c | 17 +++-------------- net/netfilter/nf_flow_table.c | 8 ++++++-- 4 files changed, 14 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 09ba67598991..76ee5c81b752 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -55,6 +55,8 @@ struct flow_offload_tuple { int oifidx; + u16 mtu; + struct dst_entry *dst_cache; }; diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c index 0cd46bffa469..461b1815e633 100644 --- a/net/ipv4/netfilter/nf_flow_table_ipv4.c +++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c @@ -178,7 +178,7 @@ static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev, } /* Based on ip_exceeds_mtu(). */ -static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) +static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) { if (skb->len <= mtu) return false; @@ -192,17 +192,6 @@ static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) return true; } -static bool nf_flow_exceeds_mtu(struct sk_buff *skb, const struct rtable *rt) -{ - u32 mtu; - - mtu = ip_dst_mtu_maybe_forward(&rt->dst, true); - if (__nf_flow_exceeds_mtu(skb, mtu)) - return true; - - return false; -} - unsigned int nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -233,9 +222,9 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); - rt = (const struct rtable *)flow->tuplehash[dir].tuple.dst_cache; - if (unlikely(nf_flow_exceeds_mtu(skb, rt))) + + if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu))) return NF_ACCEPT; if (skb_try_make_writable(skb, sizeof(*iph))) diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c index 207cb35569b1..0e6328490142 100644 --- a/net/ipv6/netfilter/nf_flow_table_ipv6.c +++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c @@ -173,7 +173,7 @@ static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev, } /* Based on ip_exceeds_mtu(). 
*/ -static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) +static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) { if (skb->len <= mtu) return false; @@ -184,17 +184,6 @@ static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) return true; } -static bool nf_flow_exceeds_mtu(struct sk_buff *skb, const struct rt6_info *rt) -{ - u32 mtu; - - mtu = ip6_dst_mtu_forward(&rt->dst); - if (__nf_flow_exceeds_mtu(skb, mtu)) - return true; - - return false; -} - unsigned int nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -225,9 +214,9 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); - rt = (struct rt6_info *)flow->tuplehash[dir].tuple.dst_cache; - if (unlikely(nf_flow_exceeds_mtu(skb, rt))) + + if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu))) return NF_ACCEPT; if (skb_try_make_writable(skb, sizeof(*ip6h))) diff --git a/net/netfilter/nf_flow_table.c b/net/netfilter/nf_flow_table.c index db0673a40b97..7403a0dfddf7 100644 --- a/net/netfilter/nf_flow_table.c +++ b/net/netfilter/nf_flow_table.c @@ -4,6 +4,8 @@ #include #include #include +#include +#include #include #include #include @@ -23,6 +25,7 @@ flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct, { struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple; struct nf_conntrack_tuple *ctt = &ct->tuplehash[dir].tuple; + struct dst_entry *dst = route->tuple[dir].dst; ft->dir = dir; @@ -30,10 +33,12 @@ flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct, case NFPROTO_IPV4: ft->src_v4 = ctt->src.u3.in; ft->dst_v4 = ctt->dst.u3.in; + ft->mtu = ip_dst_mtu_maybe_forward(dst, true); break; case NFPROTO_IPV6: ft->src_v6 = ctt->src.u3.in6; ft->dst_v6 = ctt->dst.u3.in6; + ft->mtu = ip6_dst_mtu_forward(dst); break; } @@ -44,8 +49,7 @@ flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct, ft->iifidx = route->tuple[dir].ifindex; ft->oifidx = route->tuple[!dir].ifindex; - - ft->dst_cache = route->tuple[dir].dst; + ft->dst_cache = dst; } struct flow_offload * -- cgit v1.2.3 From 6a3e030f08e1b700aa6d1ebdc7ebe4c44a2ef67a Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 20 Apr 2018 15:37:57 -0700 Subject: net/ipv6: Clean up rt expires helpers rt6_clean_expires and rt6_set_expires are no longer used. Removed them. rt6_update_expires has 1 caller in route.c, so move it from the header. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_fib.h | 21 --------------------- net/ipv6/route.c | 9 +++++++++ 2 files changed, 9 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 642211174692..327a74cd6a0d 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -223,27 +223,6 @@ static inline bool fib6_check_expired(const struct fib6_info *f6i) return false; } -static inline void rt6_clean_expires(struct rt6_info *rt) -{ - rt->rt6i_flags &= ~RTF_EXPIRES; - rt->dst.expires = 0; -} - -static inline void rt6_set_expires(struct rt6_info *rt, unsigned long expires) -{ - rt->dst.expires = expires; - rt->rt6i_flags |= RTF_EXPIRES; -} - -static inline void rt6_update_expires(struct rt6_info *rt0, int timeout) -{ - if (!(rt0->rt6i_flags & RTF_EXPIRES) && rt0->from) - rt0->dst.expires = rt0->from->expires; - - dst_set_expires(&rt0->dst, timeout); - rt0->rt6i_flags |= RTF_EXPIRES; -} - /* Function to safely get fn->sernum for passed in rt * and store result in passed in cookie. * Return true if we can get cookie safely diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 884d9cee790f..062dd4d8232c 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2221,6 +2221,15 @@ static void ip6_link_failure(struct sk_buff *skb) } } +static void rt6_update_expires(struct rt6_info *rt0, int timeout) +{ + if (!(rt0->rt6i_flags & RTF_EXPIRES) && rt0->from) + rt0->dst.expires = rt0->from->expires; + + dst_set_expires(&rt0->dst, timeout); + rt0->rt6i_flags |= RTF_EXPIRES; +} + static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) { struct net *net = dev_net(rt->dst.dev); -- cgit v1.2.3 From a269f1a764bb3abf5c73499fb74bc7ab1c2e986c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 20 Apr 2018 15:37:58 -0700 Subject: net/ipv6: Rename rt6_get_cookie_safe rt6_get_cookie_safe takes a fib6_info and checks the sernum of the node. Update the name to reflect its purpose. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_fib.h | 6 +++--- net/ipv6/route.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 327a74cd6a0d..dd1481ed8bdb 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -228,8 +228,8 @@ static inline bool fib6_check_expired(const struct fib6_info *f6i) * Return true if we can get cookie safely * Return false if not */ -static inline bool rt6_get_cookie_safe(const struct fib6_info *f6i, - u32 *cookie) +static inline bool fib6_get_cookie_safe(const struct fib6_info *f6i, + u32 *cookie) { struct fib6_node *fn; bool status = false; @@ -254,7 +254,7 @@ static inline u32 rt6_get_cookie(const struct rt6_info *rt) if (rt->rt6i_flags & RTF_PCPU || (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from)) - rt6_get_cookie_safe(rt->from, &cookie); + fib6_get_cookie_safe(rt->from, &cookie); return cookie; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 062dd4d8232c..2d6fcfe11c82 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2128,7 +2128,7 @@ static bool fib6_check(struct fib6_info *f6i, u32 cookie) { u32 rt_cookie = 0; - if ((f6i && !rt6_get_cookie_safe(f6i, &rt_cookie)) || + if ((f6i && !fib6_get_cookie_safe(f6i, &rt_cookie)) || rt_cookie != cookie) return false; @@ -2142,7 +2142,7 @@ static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) { u32 rt_cookie = 0; - if ((rt->from && !rt6_get_cookie_safe(rt->from, &rt_cookie)) || + if ((rt->from && !fib6_get_cookie_safe(rt->from, &rt_cookie)) || rt_cookie != cookie) return NULL; -- cgit v1.2.3 From a87b7dc9f7fa7c87cbd575361553e8e9e4ab4473 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 20 Apr 2018 15:38:00 -0700 Subject: net/ipv6: Move rcu locking to callers of fib6_get_cookie_safe A later patch protects 'from' in rt6_info and this simplifies the locking needed by it. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_fib.h | 6 ++++-- net/ipv6/route.c | 13 ++++++++++--- 2 files changed, 14 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index dd1481ed8bdb..dc3505fb62b3 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -234,7 +234,6 @@ static inline bool fib6_get_cookie_safe(const struct fib6_info *f6i, struct fib6_node *fn; bool status = false; - rcu_read_lock(); fn = rcu_dereference(f6i->fib6_node); if (fn) { @@ -244,7 +243,6 @@ static inline bool fib6_get_cookie_safe(const struct fib6_info *f6i, status = true; } - rcu_read_unlock(); return status; } @@ -252,10 +250,14 @@ static inline u32 rt6_get_cookie(const struct rt6_info *rt) { u32 cookie = 0; + rcu_read_lock(); + if (rt->rt6i_flags & RTF_PCPU || (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from)) fib6_get_cookie_safe(rt->from, &cookie); + rcu_read_unlock(); + return cookie; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index f8b22183b7fd..810a37ac043b 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2159,9 +2159,12 @@ static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie) static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) { + struct dst_entry *dst_ret; struct rt6_info *rt; - rt = (struct rt6_info *) dst; + rt = container_of(dst, struct rt6_info, dst); + + rcu_read_lock(); /* All IPV6 dsts are created with ->obsolete set to the value * DST_OBSOLETE_FORCE_CHK which forces validation calls down @@ -2170,9 +2173,13 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) if (rt->rt6i_flags & RTF_PCPU || (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from)) - return rt6_dst_from_check(rt, cookie); + dst_ret = rt6_dst_from_check(rt, cookie); else - return rt6_check(rt, cookie); + dst_ret = rt6_check(rt, cookie); + + rcu_read_unlock(); + + return dst_ret; } static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) -- cgit v1.2.3 From a68886a691804d3f6d479ebf6825480fbafb6a00 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 20 Apr 2018 15:38:02 -0700 Subject: net/ipv6: Make from in rt6_info rcu protected When a dst entry is created from a fib entry, the 'from' in rt6_info is set to the fib entry. The 'from' reference is used most notably for cookie checking - making sure stale dst entries are updated if the fib entry is changed. When a fib entry is deleted, the pcpu routes on it are walked releasing the fib6_info reference. This is needed for the fib6_info cleanup to happen and to make sure all device references are released in a timely manner. There is a race window when a FIB entry is deleted and the 'from' on the pcpu route is dropped and the pcpu route hits a cookie check. Handle this race using rcu on from. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_fib.h | 10 +++--- net/ipv6/ip6_fib.c | 8 +++-- net/ipv6/ip6_output.c | 9 +++-- net/ipv6/route.c | 96 ++++++++++++++++++++++++++++++++++++--------------- 4 files changed, 88 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index dc3505fb62b3..1af450d4e923 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -174,7 +174,7 @@ struct fib6_info { struct rt6_info { struct dst_entry dst; - struct fib6_info *from; + struct fib6_info __rcu *from; struct rt6key rt6i_dst; struct rt6key rt6i_src; @@ -248,13 +248,15 @@ static inline bool fib6_get_cookie_safe(const struct fib6_info *f6i, static inline u32 rt6_get_cookie(const struct rt6_info *rt) { + struct fib6_info *from; u32 cookie = 0; rcu_read_lock(); - if (rt->rt6i_flags & RTF_PCPU || - (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from)) - fib6_get_cookie_safe(rt->from, &cookie); + from = rcu_dereference(rt->from); + if (from && (rt->rt6i_flags & RTF_PCPU || + unlikely(!list_empty(&rt->rt6i_uncached)))) + fib6_get_cookie_safe(from, &cookie); rcu_read_unlock(); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 64ca02b7745f..6421c893466e 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -875,8 +875,12 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i, ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu); pcpu_rt = *ppcpu_rt; if (pcpu_rt) { - fib6_info_release(pcpu_rt->from); - pcpu_rt->from = NULL; + struct fib6_info *from; + + from = rcu_dereference_protected(pcpu_rt->from, + lockdep_is_held(&table->tb6_lock)); + rcu_assign_pointer(pcpu_rt->from, NULL); + fib6_info_release(from); } } } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 3db47986ef38..6a477d54f8c7 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -962,16 +962,21 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, * that's why we try it again later. */ if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) { + struct fib6_info *from; struct rt6_info *rt; bool had_dst = *dst != NULL; if (!had_dst) *dst = ip6_route_output(net, sk, fl6); rt = (*dst)->error ? NULL : (struct rt6_info *)*dst; - err = ip6_route_get_saddr(net, rt ? rt->from : NULL, - &fl6->daddr, + + rcu_read_lock(); + from = rt ? rcu_dereference(rt->from) : NULL; + err = ip6_route_get_saddr(net, from, &fl6->daddr, sk ? 
inet6_sk(sk)->srcprefs : 0, &fl6->saddr); + rcu_read_unlock(); + if (err) goto out_err_release; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 810a37ac043b..004d00fe2fe5 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -359,7 +359,7 @@ EXPORT_SYMBOL(ip6_dst_alloc); static void ip6_dst_destroy(struct dst_entry *dst) { struct rt6_info *rt = (struct rt6_info *)dst; - struct fib6_info *from = rt->from; + struct fib6_info *from; struct inet6_dev *idev; dst_destroy_metrics_generic(dst); @@ -371,8 +371,11 @@ static void ip6_dst_destroy(struct dst_entry *dst) in6_dev_put(idev); } - rt->from = NULL; + rcu_read_lock(); + from = rcu_dereference(rt->from); + rcu_assign_pointer(rt->from, NULL); fib6_info_release(from); + rcu_read_unlock(); } static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, @@ -402,12 +405,16 @@ static bool __rt6_check_expired(const struct rt6_info *rt) static bool rt6_check_expired(const struct rt6_info *rt) { + struct fib6_info *from; + + from = rcu_dereference(rt->from); + if (rt->rt6i_flags & RTF_EXPIRES) { if (time_after(jiffies, rt->dst.expires)) return true; - } else if (rt->from) { + } else if (from) { return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK || - fib6_check_expired(rt->from); + fib6_check_expired(from); } return false; } @@ -963,7 +970,7 @@ static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from) { rt->rt6i_flags &= ~RTF_EXPIRES; fib6_info_hold(from); - rt->from = from; + rcu_assign_pointer(rt->from, from); dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true); if (from->fib6_metrics != &dst_default_metrics) { rt->dst._metrics |= DST_METRICS_REFCOUNTED; @@ -2133,11 +2140,13 @@ static bool fib6_check(struct fib6_info *f6i, u32 cookie) return true; } -static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) +static struct dst_entry *rt6_check(struct rt6_info *rt, + struct fib6_info *from, + u32 cookie) { u32 rt_cookie = 0; - if ((rt->from && !fib6_get_cookie_safe(rt->from, &rt_cookie)) || + if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) || rt_cookie != cookie) return NULL; @@ -2147,11 +2156,13 @@ static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) return &rt->dst; } -static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie) +static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, + struct fib6_info *from, + u32 cookie) { if (!__rt6_check_expired(rt) && rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && - fib6_check(rt->from, cookie)) + fib6_check(from, cookie)) return &rt->dst; else return NULL; @@ -2160,6 +2171,7 @@ static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie) static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) { struct dst_entry *dst_ret; + struct fib6_info *from; struct rt6_info *rt; rt = container_of(dst, struct rt6_info, dst); @@ -2171,11 +2183,13 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) * into this function always. 
*/ - if (rt->rt6i_flags & RTF_PCPU || - (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from)) - dst_ret = rt6_dst_from_check(rt, cookie); + from = rcu_dereference(rt->from); + + if (from && (rt->rt6i_flags & RTF_PCPU || + unlikely(!list_empty(&rt->rt6i_uncached)))) + dst_ret = rt6_dst_from_check(rt, from, cookie); else - dst_ret = rt6_check(rt, cookie); + dst_ret = rt6_check(rt, from, cookie); rcu_read_unlock(); @@ -2211,13 +2225,17 @@ static void ip6_link_failure(struct sk_buff *skb) if (rt->rt6i_flags & RTF_CACHE) { if (dst_hold_safe(&rt->dst)) rt6_remove_exception_rt(rt); - } else if (rt->from) { + } else { + struct fib6_info *from; struct fib6_node *fn; rcu_read_lock(); - fn = rcu_dereference(rt->from->fib6_node); - if (fn && (rt->rt6i_flags & RTF_DEFAULT)) - fn->fn_sernum = -1; + from = rcu_dereference(rt->from); + if (from) { + fn = rcu_dereference(from->fib6_node); + if (fn && (rt->rt6i_flags & RTF_DEFAULT)) + fn->fn_sernum = -1; + } rcu_read_unlock(); } } @@ -2225,8 +2243,15 @@ static void ip6_link_failure(struct sk_buff *skb) static void rt6_update_expires(struct rt6_info *rt0, int timeout) { - if (!(rt0->rt6i_flags & RTF_EXPIRES) && rt0->from) - rt0->dst.expires = rt0->from->expires; + if (!(rt0->rt6i_flags & RTF_EXPIRES)) { + struct fib6_info *from; + + rcu_read_lock(); + from = rcu_dereference(rt0->from); + if (from) + rt0->dst.expires = from->expires; + rcu_read_unlock(); + } dst_set_expires(&rt0->dst, timeout); rt0->rt6i_flags |= RTF_EXPIRES; @@ -2243,8 +2268,14 @@ static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) { + bool from_set; + + rcu_read_lock(); + from_set = !!rcu_dereference(rt->from); + rcu_read_unlock(); + return !(rt->rt6i_flags & RTF_CACHE) && - (rt->rt6i_flags & RTF_PCPU || rt->from); + (rt->rt6i_flags & RTF_PCPU || from_set); } static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, @@ -2280,16 +2311,18 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, if (rt6->rt6i_flags & RTF_CACHE) rt6_update_exception_stamp_rt(rt6); } else if (daddr) { + struct fib6_info *from; struct rt6_info *nrt6; rcu_read_lock(); - nrt6 = ip6_rt_cache_alloc(rt6->from, daddr, saddr); - rcu_read_unlock(); + from = rcu_dereference(rt6->from); + nrt6 = ip6_rt_cache_alloc(from, daddr, saddr); if (nrt6) { rt6_do_update_pmtu(nrt6, mtu); - if (rt6_insert_exception(nrt6, rt6->from)) + if (rt6_insert_exception(nrt6, from)) dst_release_immediate(&nrt6->dst); } + rcu_read_unlock(); } } @@ -3222,6 +3255,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu struct ndisc_options ndopts; struct inet6_dev *in6_dev; struct neighbour *neigh; + struct fib6_info *from; struct rd_msg *msg; int optlen, on_link; u8 *lladdr; @@ -3304,7 +3338,8 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu NDISC_REDIRECT, &ndopts); rcu_read_lock(); - nrt = ip6_rt_cache_alloc(rt->from, &msg->dest, NULL); + from = rcu_dereference(rt->from); + nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL); rcu_read_unlock(); if (!nrt) goto out; @@ -4687,6 +4722,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct net *net = sock_net(in_skb->sk); struct nlattr *tb[RTA_MAX+1]; int err, iif = 0, oif = 0; + struct fib6_info *from; struct dst_entry *dst; struct rt6_info *rt; struct sk_buff *skb; @@ -4783,15 +4819,21 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, } skb_dst_set(skb, 
&rt->dst); + + rcu_read_lock(); + from = rcu_dereference(rt->from); + if (fibmatch) - err = rt6_fill_node(net, skb, rt->from, NULL, NULL, NULL, iif, + err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif, RTM_NEWROUTE, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0); else - err = rt6_fill_node(net, skb, rt->from, dst, - &fl6.daddr, &fl6.saddr, iif, RTM_NEWROUTE, + err = rt6_fill_node(net, skb, from, dst, &fl6.daddr, + &fl6.saddr, iif, RTM_NEWROUTE, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0); + rcu_read_unlock(); + if (err < 0) { kfree_skb(skb); goto errout; -- cgit v1.2.3 From a47060cf40c6c8e6e125692f5d73ceea173b12b7 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 6 Apr 2018 15:31:49 +0200 Subject: usb: audio-v2: Correct the comment for struct uac_clock_selector_descriptor The comment in UAC2 clock selector descriptor definition mentions the bAssocTerminal after baCSourceID[], but it doesn't exist in the actual definition. Let's correct it. Fixes: 5dd360ebd832 ("include/linux/usb/audio-v2.h: add more UAC2 details") Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/audio-v2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/usb/audio-v2.h b/include/linux/usb/audio-v2.h index aaafecf073ff..49699255cfd3 100644 --- a/include/linux/usb/audio-v2.h +++ b/include/linux/usb/audio-v2.h @@ -94,7 +94,7 @@ struct uac_clock_selector_descriptor { __u8 bClockID; __u8 bNrInPins; __u8 baCSourceID[]; - /* bmControls, bAssocTerminal and iClockSource omitted */ + /* bmControls and iClockSource omitted */ } __attribute__((packed)); /* 4.7.2.3 Clock Multiplier Descriptor */ -- cgit v1.2.3 From 60b9f942bc059913bcdac90d5b225f557438b5c5 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 18 Apr 2018 11:26:19 +0200 Subject: USB: phy: drop unused legacy controller-phy bind helper Drop the unused legacy usb_bind_phy() helper whose last user was removed in 2016 when OMAP moved to device-tree boot (9080b8dc761a ("ARM: OMAP2+: Remove legacy usb-host.c platform init code")). Note that this means that for the last couple of years the phy_bind_list has been empty (when using mainline kernels) and that consequently all phy lookups using the usb_get_phy_dev() interface have failed with -ENODEV. This helper along with its current users will be removed by follow-on patches. Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/phy/phy.c | 34 ---------------------------------- include/linux/usb/phy.h | 8 -------- 2 files changed, 42 deletions(-) (limited to 'include') diff --git a/drivers/usb/phy/phy.c b/drivers/usb/phy/phy.c index bceb2c9988dd..833547b00383 100644 --- a/drivers/usb/phy/phy.c +++ b/drivers/usb/phy/phy.c @@ -795,40 +795,6 @@ void usb_remove_phy(struct usb_phy *x) } EXPORT_SYMBOL_GPL(usb_remove_phy); -/** - * usb_bind_phy - bind the phy and the controller that uses the phy - * @dev_name: the device name of the device that will bind to the phy - * @index: index to specify the port number - * @phy_dev_name: the device name of the phy - * - * Fills the phy_bind structure with the dev_name and phy_dev_name. This will - * be used when the phy driver registers the phy and when the controller - * requests this phy. - * - * To be used by platform specific initialization code. 
- */ -int usb_bind_phy(const char *dev_name, u8 index, - const char *phy_dev_name) -{ - struct usb_phy_bind *phy_bind; - unsigned long flags; - - phy_bind = kzalloc(sizeof(*phy_bind), GFP_KERNEL); - if (!phy_bind) - return -ENOMEM; - - phy_bind->dev_name = dev_name; - phy_bind->phy_dev_name = phy_dev_name; - phy_bind->index = index; - - spin_lock_irqsave(&phy_lock, flags); - list_add_tail(&phy_bind->list, &phy_bind_list); - spin_unlock_irqrestore(&phy_lock, flags); - - return 0; -} -EXPORT_SYMBOL_GPL(usb_bind_phy); - /** * usb_phy_set_event - set event to phy event * @x: the phy returned by usb_get_phy(); diff --git a/include/linux/usb/phy.h b/include/linux/usb/phy.h index b7a2625947f5..ac5a079161e1 100644 --- a/include/linux/usb/phy.h +++ b/include/linux/usb/phy.h @@ -242,8 +242,6 @@ extern struct usb_phy *devm_usb_get_phy_by_node(struct device *dev, struct device_node *node, struct notifier_block *nb); extern void usb_put_phy(struct usb_phy *); extern void devm_usb_put_phy(struct device *dev, struct usb_phy *x); -extern int usb_bind_phy(const char *dev_name, u8 index, - const char *phy_dev_name); extern void usb_phy_set_event(struct usb_phy *x, unsigned long event); extern void usb_phy_set_charger_current(struct usb_phy *usb_phy, unsigned int mA); @@ -293,12 +291,6 @@ static inline void devm_usb_put_phy(struct device *dev, struct usb_phy *x) { } -static inline int usb_bind_phy(const char *dev_name, u8 index, - const char *phy_dev_name) -{ - return -EOPNOTSUPP; -} - static inline void usb_phy_set_event(struct usb_phy *x, unsigned long event) { } -- cgit v1.2.3 From bc40f53417410be18298c5b5dbf5bcae9588d84f Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 18 Apr 2018 11:26:20 +0200 Subject: USB: core: hcd: drop support for legacy phys Drop support for looking up and initialising legacy phys in USB core, something which hasn't been used by a mainline kernel since commit 9080b8dc761a ("ARM: OMAP2+: Remove legacy usb-host.c platform init code"). Specifically, since that commit usb_get_phy_dev() has always returned -ENODEV and consequently this code has not been used.
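Put differently, here is a simplified excerpt of the logic being deleted from usb_add_hcd() below (not a verbatim quote): the lookup could never succeed, so only the error branch ever ran:

	phy = usb_get_phy_dev(hcd->self.sysdev, 0); /* always ERR_PTR(-ENODEV) */
	if (IS_ERR(phy)) {
		if (PTR_ERR(phy) == -EPROBE_DEFER)	/* never true here */
			return PTR_ERR(phy);
		/* fall through: no phy to initialise */
	}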
Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hcd.c | 38 +++----------------------------------- include/linux/usb/hcd.h | 1 - 2 files changed, 3 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index 777036ae6367..6241d32c5ba7 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -33,7 +33,6 @@ #include #include #include -#include #include #include "usb.h" @@ -2739,30 +2738,10 @@ int usb_add_hcd(struct usb_hcd *hcd, int retval; struct usb_device *rhdev; - if (IS_ENABLED(CONFIG_USB_PHY) && !hcd->skip_phy_initialization) { - struct usb_phy *phy = usb_get_phy_dev(hcd->self.sysdev, 0); - - if (IS_ERR(phy)) { - retval = PTR_ERR(phy); - if (retval == -EPROBE_DEFER) - return retval; - } else { - retval = usb_phy_init(phy); - if (retval) { - usb_put_phy(phy); - return retval; - } - hcd->usb_phy = phy; - hcd->remove_phy = 1; - } - } - if (!hcd->skip_phy_initialization && usb_hcd_is_primary_hcd(hcd)) { hcd->phy_roothub = usb_phy_roothub_init(hcd->self.sysdev); - if (IS_ERR(hcd->phy_roothub)) { - retval = PTR_ERR(hcd->phy_roothub); - goto err_phy_roothub_init; - } + if (IS_ERR(hcd->phy_roothub)) + return PTR_ERR(hcd->phy_roothub); retval = usb_phy_roothub_power_on(hcd->phy_roothub); if (retval) @@ -2936,12 +2915,7 @@ err_create_buf: usb_phy_roothub_power_off(hcd->phy_roothub); err_usb_phy_roothub_power_on: usb_phy_roothub_exit(hcd->phy_roothub); -err_phy_roothub_init: - if (hcd->remove_phy && hcd->usb_phy) { - usb_phy_shutdown(hcd->usb_phy); - usb_put_phy(hcd->usb_phy); - hcd->usb_phy = NULL; - } + return retval; } EXPORT_SYMBOL_GPL(usb_add_hcd); @@ -3017,12 +2991,6 @@ void usb_remove_hcd(struct usb_hcd *hcd) usb_phy_roothub_power_off(hcd->phy_roothub); usb_phy_roothub_exit(hcd->phy_roothub); - if (hcd->remove_phy && hcd->usb_phy) { - usb_phy_shutdown(hcd->usb_phy); - usb_put_phy(hcd->usb_phy); - hcd->usb_phy = NULL; - } - usb_put_invalidate_rhdev(hcd); hcd->flags = 0; } diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h index aef50cb2ed1b..e33009c77840 100644 --- a/include/linux/usb/hcd.h +++ b/include/linux/usb/hcd.h @@ -150,7 +150,6 @@ struct usb_hcd { unsigned rh_pollable:1; /* may we poll the root hub? */ unsigned msix_enabled:1; /* driver has MSI-X enabled? */ unsigned msi_enabled:1; /* driver has MSI enabled? */ - unsigned remove_phy:1; /* auto-remove USB phy */ /* * do not manage the PHY state in the HCD core, instead let the driver * handle this (for example if the PHY can only be turned on after a -- cgit v1.2.3 From c3c0ac70c77d34c03e9600170932b2c28478795f Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 18 Apr 2018 11:26:24 +0200 Subject: USB: phy: drop legacy board-file support The legacy interface for associating controllers with phys from board files and platform code has been unused since commit 9080b8dc761a ("ARM: OMAP2+: Remove legacy usb-host.c platform init code"). Since then, all calls to usb_get_phy_dev() and its devres version have been returning -ENODEV. Now that the final calls to these functions have been removed, we can drop this legacy lookup interface altogether. 
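For context, a sketch of the two halves of the contract being removed; the call sites are illustrative, patterned on the old OMAP board code rather than taken from this patch:

	/* board file, at platform init time: record the binding */
	usb_bind_phy("musb-hdrc.0.auto", 0, "twl4030_usb");

	/* controller driver, at probe time: look the phy up by index */
	struct usb_phy *phy = devm_usb_get_phy_dev(dev, 0);

	if (IS_ERR(phy))
		return PTR_ERR(phy);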
Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/phy/phy.c | 99 +------------------------------------------------ include/linux/usb/phy.h | 28 -------------- 2 files changed, 2 insertions(+), 125 deletions(-) (limited to 'include') diff --git a/drivers/usb/phy/phy.c b/drivers/usb/phy/phy.c index 833547b00383..0277f62739a2 100644 --- a/drivers/usb/phy/phy.c +++ b/drivers/usb/phy/phy.c @@ -27,7 +27,6 @@ #define DEFAULT_ACA_CUR_MAX 5000 static LIST_HEAD(phy_list); -static LIST_HEAD(phy_bind_list); static DEFINE_SPINLOCK(phy_lock); struct phy_devm { @@ -50,24 +49,6 @@ static struct usb_phy *__usb_find_phy(struct list_head *list, return ERR_PTR(-ENODEV); } -static struct usb_phy *__usb_find_phy_dev(struct device *dev, - struct list_head *list, u8 index) -{ - struct usb_phy_bind *phy_bind = NULL; - - list_for_each_entry(phy_bind, list, list) { - if (!(strcmp(phy_bind->dev_name, dev_name(dev))) && - phy_bind->index == index) { - if (phy_bind->phy) - return phy_bind->phy; - else - return ERR_PTR(-EPROBE_DEFER); - } - } - - return ERR_PTR(-ENODEV); -} - static struct usb_phy *__of_usb_find_phy(struct device_node *node) { struct usb_phy *phy; @@ -584,72 +565,6 @@ struct usb_phy *devm_usb_get_phy_by_phandle(struct device *dev, } EXPORT_SYMBOL_GPL(devm_usb_get_phy_by_phandle); -/** - * usb_get_phy_dev - find the USB PHY - * @dev - device that requests this phy - * @index - the index of the phy - * - * Returns the phy driver, after getting a refcount to it; or - * -ENODEV if there is no such phy. The caller is responsible for - * calling usb_put_phy() to release that count. - * - * For use by USB host and peripheral drivers. - */ -struct usb_phy *usb_get_phy_dev(struct device *dev, u8 index) -{ - struct usb_phy *phy = NULL; - unsigned long flags; - - spin_lock_irqsave(&phy_lock, flags); - - phy = __usb_find_phy_dev(dev, &phy_bind_list, index); - if (IS_ERR(phy) || !try_module_get(phy->dev->driver->owner)) { - dev_dbg(dev, "unable to find transceiver\n"); - if (!IS_ERR(phy)) - phy = ERR_PTR(-ENODEV); - - goto err0; - } - - get_device(phy->dev); - -err0: - spin_unlock_irqrestore(&phy_lock, flags); - - return phy; -} -EXPORT_SYMBOL_GPL(usb_get_phy_dev); - -/** - * devm_usb_get_phy_dev - find the USB PHY using device ptr and index - * @dev - device that requests this phy - * @index - the index of the phy - * - * Gets the phy using usb_get_phy_dev(), and associates a device with it using - * devres. On driver detach, release function is invoked on the devres data, - * then, devres data is freed. - * - * For use by USB host and peripheral drivers. 
- */ -struct usb_phy *devm_usb_get_phy_dev(struct device *dev, u8 index) -{ - struct usb_phy **ptr, *phy; - - ptr = devres_alloc(devm_usb_phy_release, sizeof(*ptr), GFP_KERNEL); - if (!ptr) - return NULL; - - phy = usb_get_phy_dev(dev, index); - if (!IS_ERR(phy)) { - *ptr = phy; - devres_add(dev, ptr); - } else - devres_free(ptr); - - return phy; -} -EXPORT_SYMBOL_GPL(devm_usb_get_phy_dev); - /** * devm_usb_put_phy - release the USB PHY * @dev - device that wants to release this phy @@ -745,7 +660,6 @@ EXPORT_SYMBOL_GPL(usb_add_phy); */ int usb_add_phy_dev(struct usb_phy *x) { - struct usb_phy_bind *phy_bind; unsigned long flags; int ret; @@ -762,13 +676,9 @@ int usb_add_phy_dev(struct usb_phy *x) ATOMIC_INIT_NOTIFIER_HEAD(&x->notifier); spin_lock_irqsave(&phy_lock, flags); - list_for_each_entry(phy_bind, &phy_bind_list, list) - if (!(strcmp(phy_bind->phy_dev_name, dev_name(x->dev)))) - phy_bind->phy = x; - list_add_tail(&x->head, &phy_list); - spin_unlock_irqrestore(&phy_lock, flags); + return 0; } EXPORT_SYMBOL_GPL(usb_add_phy_dev); @@ -782,15 +692,10 @@ EXPORT_SYMBOL_GPL(usb_add_phy_dev); void usb_remove_phy(struct usb_phy *x) { unsigned long flags; - struct usb_phy_bind *phy_bind; spin_lock_irqsave(&phy_lock, flags); - if (x) { - list_for_each_entry(phy_bind, &phy_bind_list, list) - if (phy_bind->phy == x) - phy_bind->phy = NULL; + if (x) list_del(&x->head); - } spin_unlock_irqrestore(&phy_lock, flags); } EXPORT_SYMBOL_GPL(usb_remove_phy); diff --git a/include/linux/usb/phy.h b/include/linux/usb/phy.h index ac5a079161e1..e4de6bc1f69b 100644 --- a/include/linux/usb/phy.h +++ b/include/linux/usb/phy.h @@ -157,22 +157,6 @@ struct usb_phy { enum usb_charger_type (*charger_detect)(struct usb_phy *x); }; -/** - * struct usb_phy_bind - represent the binding for the phy - * @dev_name: the device name of the device that will bind to the phy - * @phy_dev_name: the device name of the phy - * @index: used if a single controller uses multiple phys - * @phy: reference to the phy - * @list: to maintain a linked list of the binding information - */ -struct usb_phy_bind { - const char *dev_name; - const char *phy_dev_name; - u8 index; - struct usb_phy *phy; - struct list_head list; -}; - /* for board-specific init logic */ extern int usb_add_phy(struct usb_phy *, enum usb_phy_type type); extern int usb_add_phy_dev(struct usb_phy *); @@ -234,8 +218,6 @@ usb_phy_vbus_off(struct usb_phy *x) extern struct usb_phy *usb_get_phy(enum usb_phy_type type); extern struct usb_phy *devm_usb_get_phy(struct device *dev, enum usb_phy_type type); -extern struct usb_phy *usb_get_phy_dev(struct device *dev, u8 index); -extern struct usb_phy *devm_usb_get_phy_dev(struct device *dev, u8 index); extern struct usb_phy *devm_usb_get_phy_by_phandle(struct device *dev, const char *phandle, u8 index); extern struct usb_phy *devm_usb_get_phy_by_node(struct device *dev, @@ -261,16 +243,6 @@ static inline struct usb_phy *devm_usb_get_phy(struct device *dev, return ERR_PTR(-ENXIO); } -static inline struct usb_phy *usb_get_phy_dev(struct device *dev, u8 index) -{ - return ERR_PTR(-ENXIO); -} - -static inline struct usb_phy *devm_usb_get_phy_dev(struct device *dev, u8 index) -{ - return ERR_PTR(-ENXIO); -} - static inline struct usb_phy *devm_usb_get_phy_by_phandle(struct device *dev, const char *phandle, u8 index) { -- cgit v1.2.3 From e8374db15436648411d933b01d929acae0538775 Mon Sep 17 00:00:00 2001 From: Li Jun Date: Mon, 16 Apr 2018 14:54:37 +0800 Subject: usb: typec: tcpm: remove max_snk_mv/ma/mw Since there is no user of 
max_snk_*, we can remove them from tcpm. Reviewed-by: Hans de Goede Signed-off-by: Li Jun Reviewed-by: Guenter Roeck Acked-by: Heikki Krogerus Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm.c | 12 ------------ include/linux/usb/tcpm.h | 9 --------- 2 files changed, 21 deletions(-) (limited to 'include') diff --git a/drivers/usb/typec/tcpm.c b/drivers/usb/typec/tcpm.c index 048b9532fd39..27192083f8e6 100644 --- a/drivers/usb/typec/tcpm.c +++ b/drivers/usb/typec/tcpm.c @@ -257,9 +257,6 @@ struct tcpm_port { u32 snk_vdo[VDO_MAX_OBJECTS]; unsigned int nr_snk_vdo; - unsigned int max_snk_mv; - unsigned int max_snk_ma; - unsigned int max_snk_mw; unsigned int operating_snk_mw; /* Requested current / voltage */ @@ -3598,9 +3595,6 @@ EXPORT_SYMBOL_GPL(tcpm_update_source_capabilities); int tcpm_update_sink_capabilities(struct tcpm_port *port, const u32 *pdo, unsigned int nr_pdo, - unsigned int max_snk_mv, - unsigned int max_snk_ma, - unsigned int max_snk_mw, unsigned int operating_snk_mw) { if (tcpm_validate_caps(port, pdo, nr_pdo)) @@ -3608,9 +3602,6 @@ int tcpm_update_sink_capabilities(struct tcpm_port *port, const u32 *pdo, mutex_lock(&port->lock); port->nr_snk_pdo = tcpm_copy_pdos(port->snk_pdo, pdo, nr_pdo); - port->max_snk_mv = max_snk_mv; - port->max_snk_ma = max_snk_ma; - port->max_snk_mw = max_snk_mw; port->operating_snk_mw = operating_snk_mw; switch (port->state) { @@ -3676,9 +3667,6 @@ struct tcpm_port *tcpm_register_port(struct device *dev, struct tcpc_dev *tcpc) port->nr_snk_vdo = tcpm_copy_vdos(port->snk_vdo, tcpc->config->snk_vdo, tcpc->config->nr_snk_vdo); - port->max_snk_mv = tcpc->config->max_snk_mv; - port->max_snk_ma = tcpc->config->max_snk_ma; - port->max_snk_mw = tcpc->config->max_snk_mw; port->operating_snk_mw = tcpc->config->operating_snk_mw; if (!tcpc->config->try_role_hw) port->try_role = tcpc->config->default_role; diff --git a/include/linux/usb/tcpm.h b/include/linux/usb/tcpm.h index f0d839daeaea..f5bda9a0f644 100644 --- a/include/linux/usb/tcpm.h +++ b/include/linux/usb/tcpm.h @@ -62,9 +62,6 @@ enum tcpm_transmit_type { * @snk_pdo: PDO parameters sent to partner as response to * PD_CTRL_GET_SINK_CAP message * @nr_snk_pdo: Number of entries in @snk_pdo - * @max_snk_mv: Maximum acceptable sink voltage in mV - * @max_snk_ma: Maximum sink current in mA - * @max_snk_mw: Maximum required sink power in mW * @operating_snk_mw: * Required operating sink power in mW * @type: Port type (TYPEC_PORT_DFP, TYPEC_PORT_UFP, or @@ -85,9 +82,6 @@ struct tcpc_config { const u32 *snk_vdo; unsigned int nr_snk_vdo; - unsigned int max_snk_mv; - unsigned int max_snk_ma; - unsigned int max_snk_mw; unsigned int operating_snk_mw; enum typec_port_type type; @@ -174,9 +168,6 @@ int tcpm_update_source_capabilities(struct tcpm_port *port, const u32 *pdo, unsigned int nr_pdo); int tcpm_update_sink_capabilities(struct tcpm_port *port, const u32 *pdo, unsigned int nr_pdo, - unsigned int max_snk_mv, - unsigned int max_snk_ma, - unsigned int max_snk_mw, unsigned int operating_snk_mw); void tcpm_vbus_change(struct tcpm_port *port); -- cgit v1.2.3 From ffe95371d2a84f3ad8085656d4fcb2fc926ff7a1 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Thu, 19 Apr 2018 19:05:50 +0300 Subject: usb: define HCD_USB32 speed option for hosts that support USB 3.2 dual-lane Hosts that support USB 3.2 Enhanced SuperSpeed can set their hcd speed to HCD_USB32 to let usb core and host drivers know that the controller supports new USB 3.2 dual-lane features.
Make sure the USB core handles HCD_USB32 hosts correctly; for now they are treated the same as HCD_USB31. Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hcd.c | 3 +++ include/linux/usb/hcd.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index 6241d32c5ba7..5a799f84dcc2 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -567,6 +567,7 @@ static int rh_call_control (struct usb_hcd *hcd, struct urb *urb) switch (wValue & 0xff00) { case USB_DT_DEVICE << 8: switch (hcd->speed) { + case HCD_USB32: case HCD_USB31: bufp = usb31_rh_dev_descriptor; break; @@ -591,6 +592,7 @@ static int rh_call_control (struct usb_hcd *hcd, struct urb *urb) break; case USB_DT_CONFIG << 8: switch (hcd->speed) { + case HCD_USB32: case HCD_USB31: case HCD_USB3: bufp = ss_rh_config_descriptor; break; @@ -2804,6 +2806,7 @@ int usb_add_hcd(struct usb_hcd *hcd, case HCD_USB3: rhdev->speed = USB_SPEED_SUPER; break; + case HCD_USB32: case HCD_USB31: rhdev->speed = USB_SPEED_SUPER_PLUS; break; diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h index e33009c77840..34a6ded6f319 100644 --- a/include/linux/usb/hcd.h +++ b/include/linux/usb/hcd.h @@ -260,6 +260,7 @@ struct hc_driver { #define HCD_USB25 0x0030 /* Wireless USB 1.0 (USB 2.5)*/ #define HCD_USB3 0x0040 /* USB 3.0 */ #define HCD_USB31 0x0050 /* USB 3.1 */ +#define HCD_USB32 0x0060 /* USB 3.2 */ #define HCD_MASK 0x0070 #define HCD_BH 0x0100 /* URB complete in BH context */ -- cgit v1.2.3 From 013eedb8c56e55549c45df19ca56a029cc804028 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Thu, 19 Apr 2018 19:05:51 +0300 Subject: USB: Add support to store lane count used by USB 3.2 USB 3.2 specification adds Dual-lane support, doubling the maximum SuperSpeedPlus data rate from 10Gbps to 20Gbps. Dual-lane takes into use a second set of rx and tx wires/pins in the Type-C cable and connector. Add "rx_lanes" and "tx_lanes" variables to struct usb_device to store the number of lanes in use. Number of lanes can be read using the extended port status hub request that was introduced in USB 3.1. Extended port status rx and tx lane counts are zero based; the maximum number of lanes supported by non-inter-chip (SSIC) USB 3.2 is 2 (dual lane), with rx and tx lane counts symmetric. SSIC devices support asymmetric lanes up to 4 lanes per direction. If extended port status is not available then default to one lane.
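As a worked example of the new accessors, the lane fields in the extended port status are zero based, so a reader adds one; this sketch mirrors the hub.c hunk below (the helper name is made up):

#include <linux/types.h>
#include <linux/usb/ch11.h>

static void example_decode_lanes(u32 ext_portstatus,
				 unsigned int *rx_lanes,
				 unsigned int *tx_lanes)
{
	/* Extended port status lane counts are zero based; add one. */
	*rx_lanes = USB_EXT_PORT_RX_LANES(ext_portstatus) + 1;
	*tx_lanes = USB_EXT_PORT_TX_LANES(ext_portstatus) + 1;
}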
Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hub.c | 8 ++++++++ include/linux/usb.h | 4 ++++ include/uapi/linux/usb/ch11.h | 5 +++++ 3 files changed, 17 insertions(+) (limited to 'include') diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index 9e79bcf03cde..e21cc3be18b7 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -2746,6 +2746,14 @@ static int hub_port_wait_reset(struct usb_hub *hub, int port1, if (!udev) return 0; + if (hub_is_superspeedplus(hub->hdev)) { + /* extended portstatus Rx and Tx lane count are zero based */ + udev->rx_lanes = USB_EXT_PORT_RX_LANES(ext_portstatus) + 1; + udev->tx_lanes = USB_EXT_PORT_TX_LANES(ext_portstatus) + 1; + } else { + udev->rx_lanes = 1; + udev->tx_lanes = 1; + } if (hub_is_wusb(hub)) udev->speed = USB_SPEED_WIRELESS; else if (hub_is_superspeedplus(hub->hdev) && diff --git a/include/linux/usb.h b/include/linux/usb.h index 0173597e59aa..beffceec4915 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -551,6 +551,8 @@ struct usb3_lpm_parameters { * @route: tree topology hex string for use with xHCI * @state: device state: configured, not attached, etc. * @speed: device speed: high/full/low (or error) + * @rx_lanes: number of rx lanes in use, USB 3.2 adds dual-lane support + * @tx_lanes: number of tx lanes in use, USB 3.2 adds dual-lane support * @tt: Transaction Translator info; used with low/full speed dev, highspeed hub * @ttport: device port on that tt hub * @toggle: one bit for each endpoint, with ([0] = IN, [1] = OUT) endpoints @@ -624,6 +626,8 @@ struct usb_device { u32 route; enum usb_device_state state; enum usb_device_speed speed; + unsigned int rx_lanes; + unsigned int tx_lanes; struct usb_tt *tt; int ttport; diff --git a/include/uapi/linux/usb/ch11.h b/include/uapi/linux/usb/ch11.h index 29c120c88747..fb0cd24c392c 100644 --- a/include/uapi/linux/usb/ch11.h +++ b/include/uapi/linux/usb/ch11.h @@ -197,6 +197,11 @@ struct usb_port_status { #define USB_EXT_PORT_STAT_RX_LANES 0x00000f00 #define USB_EXT_PORT_STAT_TX_LANES 0x0000f000 +#define USB_EXT_PORT_RX_LANES(p) \ + (((p) & USB_EXT_PORT_STAT_RX_LANES) >> 8) +#define USB_EXT_PORT_TX_LANES(p) \ + (((p) & USB_EXT_PORT_STAT_TX_LANES) >> 12) + /* * wHubCharacteristics (masks) * See USB 2.0 spec Table 11-13, offset 3 -- cgit v1.2.3 From 143470368efd3f0fb4cbe81aa89f49de8048d8a9 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Tue, 10 Apr 2018 01:02:58 +0300 Subject: usb: tegra: Move utmi-pads reset from ehci-tegra to tegra-phy UTMI pads are shared by the USB controllers, and the reset of the UTMI pads is shared with the reset of the USB1 controller. Currently, reset of the UTMI pads is done by the EHCI driver, and the ChipIdea UDC works only because the EHCI driver always happens to be probed first. Move reset controls from ehci-tegra to tegra-phy in order to resolve the problem.
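The reset sequence adopted below can be summarised as follows: with a shared reset line, one extra deassert "initializes" the shared reference count before the usual assert/delay/deassert cycle. A minimal sketch of that pattern (the function name is illustrative):

#include <linux/delay.h>
#include <linux/reset.h>

static int example_shared_reset_cycle(struct reset_control *rst)
{
	int err;

	/* The reset control is shared, hence initialize it first. */
	err = reset_control_deassert(rst);
	if (err)
		return err;

	err = reset_control_assert(rst);
	if (err)
		return err;

	udelay(1);	/* minimum assertion time */

	return reset_control_deassert(rst);
}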
Signed-off-by: Dmitry Osipenko Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/ehci-tegra.c | 87 ++++++++++++++++++--------------------- drivers/usb/phy/phy-tegra-usb.c | 79 ++++++++++++++++++++++++++++++++--- include/linux/usb/tegra_usb_phy.h | 2 + 3 files changed, 115 insertions(+), 53 deletions(-) (limited to 'include') diff --git a/drivers/usb/host/ehci-tegra.c b/drivers/usb/host/ehci-tegra.c index a6f4389f7e88..4d2cdec4cb78 100644 --- a/drivers/usb/host/ehci-tegra.c +++ b/drivers/usb/host/ehci-tegra.c @@ -36,7 +36,6 @@ #define DRV_NAME "tegra-ehci" static struct hc_driver __read_mostly tegra_ehci_hc_driver; -static bool usb1_reset_attempted; struct tegra_ehci_soc_config { bool has_hostpc; @@ -51,67 +50,54 @@ struct tegra_ehci_hcd { enum tegra_usb_phy_port_speed port_speed; }; -/* - * The 1st USB controller contains some UTMI pad registers that are global for - * all the controllers on the chip. Those registers are also cleared when - * reset is asserted to the 1st controller. This means that the 1st controller - * can only be reset when no other controlled has finished probing. So we'll - * reset the 1st controller before doing any other setup on any of the - * controllers, and then never again. - * - * Since this is a PHY issue, the Tegra PHY driver should probably be doing - * the resetting of the USB controllers. But to keep compatibility with old - * device trees that don't have reset phandles in the PHYs, do it here. - * Those old DTs will be vulnerable to total USB breakage if the 1st EHCI - * device isn't the first one to finish probing, so warn them. - */ static int tegra_reset_usb_controller(struct platform_device *pdev) { struct device_node *phy_np; struct usb_hcd *hcd = platform_get_drvdata(pdev); struct tegra_ehci_hcd *tegra = (struct tegra_ehci_hcd *)hcd_to_ehci(hcd)->priv; - bool has_utmi_pad_registers = false; + struct reset_control *rst; + int err; phy_np = of_parse_phandle(pdev->dev.of_node, "nvidia,phy", 0); if (!phy_np) return -ENOENT; - if (of_property_read_bool(phy_np, "nvidia,has-utmi-pad-registers")) - has_utmi_pad_registers = true; + /* + * The 1st USB controller contains some UTMI pad registers that are + * global for all the controllers on the chip. Those registers are + * also cleared when reset is asserted to the 1st controller. + */ + rst = of_reset_control_get_shared(phy_np, "utmi-pads"); + if (IS_ERR(rst)) { + dev_warn(&pdev->dev, + "can't get utmi-pads reset from the PHY\n"); + dev_warn(&pdev->dev, + "continuing, but please update your DT\n"); + } else { + /* + * PHY driver performs UTMI-pads reset in a case of + * non-legacy DT. 
+ */ + reset_control_put(rst); + } - if (!usb1_reset_attempted) { - struct reset_control *usb1_reset; + of_node_put(phy_np); - if (!has_utmi_pad_registers) - usb1_reset = of_reset_control_get(phy_np, "utmi-pads"); - else - usb1_reset = tegra->rst; - - if (IS_ERR(usb1_reset)) { - dev_warn(&pdev->dev, - "can't get utmi-pads reset from the PHY\n"); - dev_warn(&pdev->dev, - "continuing, but please update your DT\n"); - } else { - reset_control_assert(usb1_reset); - udelay(1); - reset_control_deassert(usb1_reset); - - if (!has_utmi_pad_registers) - reset_control_put(usb1_reset); - } + /* reset control is shared, hence initialize it first */ + err = reset_control_deassert(tegra->rst); + if (err) + return err; - usb1_reset_attempted = true; - } + err = reset_control_assert(tegra->rst); + if (err) + return err; - if (!has_utmi_pad_registers) { - reset_control_assert(tegra->rst); - udelay(1); - reset_control_deassert(tegra->rst); - } + udelay(1); - of_node_put(phy_np); + err = reset_control_deassert(tegra->rst); + if (err) + return err; return 0; } @@ -440,7 +426,7 @@ static int tegra_ehci_probe(struct platform_device *pdev) goto cleanup_hcd_create; } - tegra->rst = devm_reset_control_get(&pdev->dev, "usb"); + tegra->rst = devm_reset_control_get_shared(&pdev->dev, "usb"); if (IS_ERR(tegra->rst)) { dev_err(&pdev->dev, "Can't get ehci reset\n"); err = PTR_ERR(tegra->rst); @@ -452,8 +438,10 @@ static int tegra_ehci_probe(struct platform_device *pdev) goto cleanup_hcd_create; err = tegra_reset_usb_controller(pdev); - if (err) + if (err) { + dev_err(&pdev->dev, "Failed to reset controller\n"); goto cleanup_clk_en; + } u_phy = devm_usb_get_phy_by_phandle(&pdev->dev, "nvidia,phy", 0); if (IS_ERR(u_phy)) { @@ -538,6 +526,9 @@ static int tegra_ehci_remove(struct platform_device *pdev) usb_phy_shutdown(hcd->usb_phy); usb_remove_hcd(hcd); + reset_control_assert(tegra->rst); + udelay(1); + clk_disable_unprepare(tegra->clk); usb_put_hcd(hcd); diff --git a/drivers/usb/phy/phy-tegra-usb.c b/drivers/usb/phy/phy-tegra-usb.c index e46219e7fa93..ea7ef1dc0b42 100644 --- a/drivers/usb/phy/phy-tegra-usb.c +++ b/drivers/usb/phy/phy-tegra-usb.c @@ -236,17 +236,83 @@ static void set_phcd(struct tegra_usb_phy *phy, bool enable) static int utmip_pad_open(struct tegra_usb_phy *phy) { - int err; + int ret; phy->pad_clk = devm_clk_get(phy->u_phy.dev, "utmi-pads"); if (IS_ERR(phy->pad_clk)) { - err = PTR_ERR(phy->pad_clk); + ret = PTR_ERR(phy->pad_clk); dev_err(phy->u_phy.dev, - "Failed to get UTMIP pad clock: %d\n", err); - return err; + "Failed to get UTMIP pad clock: %d\n", ret); + return ret; } - return 0; + phy->pad_rst = devm_reset_control_get_optional_shared( + phy->u_phy.dev, "utmi-pads"); + if (IS_ERR(phy->pad_rst)) { + ret = PTR_ERR(phy->pad_rst); + dev_err(phy->u_phy.dev, + "Failed to get UTMI-pads reset: %d\n", ret); + return ret; + } + + ret = clk_prepare_enable(phy->pad_clk); + if (ret) { + dev_err(phy->u_phy.dev, + "Failed to enable UTMI-pads clock: %d\n", ret); + return ret; + } + + spin_lock(&utmip_pad_lock); + + ret = reset_control_deassert(phy->pad_rst); + if (ret) { + dev_err(phy->u_phy.dev, + "Failed to initialize UTMI-pads reset: %d\n", ret); + goto unlock; + } + + ret = reset_control_assert(phy->pad_rst); + if (ret) { + dev_err(phy->u_phy.dev, + "Failed to assert UTMI-pads reset: %d\n", ret); + goto unlock; + } + + udelay(1); + + ret = reset_control_deassert(phy->pad_rst); + if (ret) + dev_err(phy->u_phy.dev, + "Failed to deassert UTMI-pads reset: %d\n", ret); +unlock: + spin_unlock(&utmip_pad_lock); + + 
clk_disable_unprepare(phy->pad_clk); + + return ret; +} + +static int utmip_pad_close(struct tegra_usb_phy *phy) +{ + int ret; + + ret = clk_prepare_enable(phy->pad_clk); + if (ret) { + dev_err(phy->u_phy.dev, + "Failed to enable UTMI-pads clock: %d\n", ret); + return ret; + } + + ret = reset_control_assert(phy->pad_rst); + if (ret) + dev_err(phy->u_phy.dev, + "Failed to assert UTMI-pads reset: %d\n", ret); + + udelay(1); + + clk_disable_unprepare(phy->pad_clk); + + return ret; } static void utmip_pad_power_on(struct tegra_usb_phy *phy) @@ -700,6 +766,9 @@ static void tegra_usb_phy_close(struct tegra_usb_phy *phy) if (!IS_ERR(phy->vbus)) regulator_disable(phy->vbus); + if (!phy->is_ulpi_phy) + utmip_pad_close(phy); + clk_disable_unprepare(phy->pll_u); } diff --git a/include/linux/usb/tegra_usb_phy.h b/include/linux/usb/tegra_usb_phy.h index d641ea1660b7..0c5c3ea8b2d7 100644 --- a/include/linux/usb/tegra_usb_phy.h +++ b/include/linux/usb/tegra_usb_phy.h @@ -17,6 +17,7 @@ #define __TEGRA_USB_PHY_H #include +#include #include /* @@ -76,6 +77,7 @@ struct tegra_usb_phy { bool is_legacy_phy; bool is_ulpi_phy; int reset_gpio; + struct reset_control *pad_rst; }; void tegra_usb_phy_preresume(struct usb_phy *phy); -- cgit v1.2.3 From c5f78b1fe4e5baf4c4ca30377c2d7e06e2e391ec Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Tue, 27 Mar 2018 11:48:24 +0800 Subject: serial: Introduce UPSTAT_SYNC_FIFO for synchronised FIFOs This change adds a flag to indicate that a UART has an external means of synchronising its FIFO, without needing CTS/RTS or XON/XOFF. This allows us to use the throttle/unthrottle callbacks, without having to claim other methods of flow control. Signed-off-by: Jeremy Kerr Tested-by: Eddie James Tested-by: Joel Stanley Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/serial_core.c | 4 ++-- include/linux/serial_core.h | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index 0466f9f08a91..c47158c93202 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -674,8 +674,8 @@ static void uart_send_xchar(struct tty_struct *tty, char ch) static void uart_throttle(struct tty_struct *tty) { struct uart_state *state = tty->driver_data; + upstat_t mask = UPSTAT_SYNC_FIFO; struct uart_port *port; - upstat_t mask = 0; port = uart_port_ref(state); if (!port) @@ -703,8 +703,8 @@ static void uart_throttle(struct tty_struct *tty) static void uart_unthrottle(struct tty_struct *tty) { struct uart_state *state = tty->driver_data; + upstat_t mask = UPSTAT_SYNC_FIFO; struct uart_port *port; - upstat_t mask = 0; port = uart_port_ref(state); if (!port) diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 1d356105f25a..d224961e1346 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -233,6 +233,7 @@ struct uart_port { #define UPSTAT_AUTORTS ((__force upstat_t) (1 << 2)) #define UPSTAT_AUTOCTS ((__force upstat_t) (1 << 3)) #define UPSTAT_AUTOXOFF ((__force upstat_t) (1 << 4)) +#define UPSTAT_SYNC_FIFO ((__force upstat_t) (1 << 5)) int hw_stopped; /* sw-assisted CTS flow state */ unsigned int mctrl; /* current modem ctrl settings */ -- cgit v1.2.3 From ebbaf9ab9ebd69f42b286c7a67cc84571c3d947a Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Tue, 27 Mar 2018 11:48:25 +0800 Subject: serial/8250: export serial8250_read_char Currently, we export serial8250_rx_chars, which does a whole bunch of reads from the 8250 data
register, without any form of flow control between reads. An upcoming change to the aspeed vuart driver implements more fine-grained flow control in the interrupt handler, requiring character-at-a-time control over the rx path. This change exports serial8250_read_char to allow this. Signed-off-by: Jeremy Kerr Tested-by: Eddie James Tested-by: Joel Stanley Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_port.c | 3 ++- include/linux/serial_8250.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index 95833cbc4338..8fbd5fbeb318 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -1680,7 +1680,7 @@ static void serial8250_enable_ms(struct uart_port *port) serial8250_rpm_put(up); } -static void serial8250_read_char(struct uart_8250_port *up, unsigned char lsr) +void serial8250_read_char(struct uart_8250_port *up, unsigned char lsr) { struct uart_port *port = &up->port; unsigned char ch; @@ -1740,6 +1740,7 @@ static void serial8250_read_char(struct uart_8250_port *up, unsigned char lsr) uart_insert_char(port, lsr, UART_LSR_OE, ch, flag); } +EXPORT_SYMBOL_GPL(serial8250_read_char); /* * serial8250_rx_chars: processes according to the passed in LSR diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h index a27ef5f56431..76b9db71e489 100644 --- a/include/linux/serial_8250.h +++ b/include/linux/serial_8250.h @@ -163,6 +163,7 @@ extern void serial8250_do_set_mctrl(struct uart_port *port, unsigned int mctrl); extern int fsl8250_handle_irq(struct uart_port *port); int serial8250_handle_irq(struct uart_port *port, unsigned int iir); unsigned char serial8250_rx_chars(struct uart_8250_port *up, unsigned char lsr); +void serial8250_read_char(struct uart_8250_port *up, unsigned char lsr); void serial8250_tx_chars(struct uart_8250_port *up); unsigned int serial8250_modem_status(struct uart_8250_port *up); void serial8250_init_port(struct uart_8250_port *up); -- cgit v1.2.3 From fbcf93ebcaef7d09881ee308b52cd84f5e43c622 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Sat, 21 Apr 2018 09:48:23 -0700 Subject: bpf: btf: Clean up btf.h in uapi This patch cleans up btf.h in uapi: 1) Rename "name" to "name_off" to better reflect it is an offset to the string section instead of a char array. 2) Remove the unused values BTF_FLAGS_COMPR and BTF_MAGIC_SWAP Suggested-by: Daniel Borkmann Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/uapi/linux/btf.h | 8 +++----- kernel/bpf/btf.c | 20 ++++++++++---------- tools/include/uapi/linux/btf.h | 8 +++----- tools/lib/bpf/btf.c | 2 +- 4 files changed, 17 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index 74a30b1090df..bcb56ee47014 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -6,9 +6,7 @@ #include #define BTF_MAGIC 0xeB9F -#define BTF_MAGIC_SWAP 0x9FeB #define BTF_VERSION 1 -#define BTF_FLAGS_COMPR 0x01 struct btf_header { __u16 magic; @@ -43,7 +41,7 @@ struct btf_header { #define BTF_STR_OFFSET(ref) ((ref) & BTF_MAX_NAME_OFFSET) struct btf_type { - __u32 name; + __u32 name_off; /* "info" bits arrangement * bits 0-15: vlen (e.g. # of struct's members) * bits 16-23: unused @@ -105,7 +103,7 @@ struct btf_type { * info in "struct btf_type").
*/ struct btf_enum { - __u32 name; + __u32 name_off; __s32 val; }; @@ -122,7 +120,7 @@ struct btf_array { * "struct btf_type"). */ struct btf_member { - __u32 name; + __u32 name_off; __u32 type; __u32 offset; /* offset in bits */ }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index eb56ac760547..22e1046a1a86 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -473,7 +473,7 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env, __btf_verifier_log(log, "[%u] %s %s%s", env->log_type_id, btf_kind_str[kind], - btf_name_by_offset(btf, t->name), + btf_name_by_offset(btf, t->name_off), log_details ? " " : ""); if (log_details) @@ -517,7 +517,7 @@ static void btf_verifier_log_member(struct btf_verifier_env *env, btf_verifier_log_type(env, struct_type, NULL); __btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u", - btf_name_by_offset(btf, member->name), + btf_name_by_offset(btf, member->name_off), member->type, member->offset); if (fmt && *fmt) { @@ -1419,10 +1419,10 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, btf_verifier_log_type(env, t, NULL); for_each_member(i, t, member) { - if (!btf_name_offset_valid(btf, member->name)) { + if (!btf_name_offset_valid(btf, member->name_off)) { btf_verifier_log_member(env, t, member, "Invalid member name_offset:%u", - member->name); + member->name_off); return -EINVAL; } @@ -1605,14 +1605,14 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, btf_verifier_log_type(env, t, NULL); for (i = 0; i < nr_enums; i++) { - if (!btf_name_offset_valid(btf, enums[i].name)) { + if (!btf_name_offset_valid(btf, enums[i].name_off)) { btf_verifier_log(env, "\tInvalid name_offset:%u", - enums[i].name); + enums[i].name_off); return -EINVAL; } btf_verifier_log(env, "\t%s val=%d\n", - btf_name_by_offset(btf, enums[i].name), + btf_name_by_offset(btf, enums[i].name_off), enums[i].val); } @@ -1636,7 +1636,7 @@ static void btf_enum_seq_show(const struct btf *btf, const struct btf_type *t, for (i = 0; i < nr_enums; i++) { if (v == enums[i].val) { seq_printf(m, "%s", - btf_name_by_offset(btf, enums[i].name)); + btf_name_by_offset(btf, enums[i].name_off)); return; } } @@ -1687,9 +1687,9 @@ static s32 btf_check_meta(struct btf_verifier_env *env, return -EINVAL; } - if (!btf_name_offset_valid(env->btf, t->name)) { + if (!btf_name_offset_valid(env->btf, t->name_off)) { btf_verifier_log(env, "[%u] Invalid name_offset:%u", - env->log_type_id, t->name); + env->log_type_id, t->name_off); return -EINVAL; } diff --git a/tools/include/uapi/linux/btf.h b/tools/include/uapi/linux/btf.h index 74a30b1090df..bcb56ee47014 100644 --- a/tools/include/uapi/linux/btf.h +++ b/tools/include/uapi/linux/btf.h @@ -6,9 +6,7 @@ #include #define BTF_MAGIC 0xeB9F -#define BTF_MAGIC_SWAP 0x9FeB #define BTF_VERSION 1 -#define BTF_FLAGS_COMPR 0x01 struct btf_header { __u16 magic; @@ -43,7 +41,7 @@ struct btf_header { #define BTF_STR_OFFSET(ref) ((ref) & BTF_MAX_NAME_OFFSET) struct btf_type { - __u32 name; + __u32 name_off; /* "info" bits arrangement * bits 0-15: vlen (e.g. # of struct's members) * bits 16-23: unused @@ -105,7 +103,7 @@ struct btf_type { * info in "struct btf_type"). */ struct btf_enum { - __u32 name; + __u32 name_off; __s32 val; }; @@ -122,7 +120,7 @@ struct btf_array { * "struct btf_type"). 
*/ struct btf_member { - __u32 name; + __u32 name_off; __u32 type; __u32 offset; /* offset in bits */ }; diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 58b6255abc7a..2bac710e3194 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -281,7 +281,7 @@ int32_t btf__find_by_name(const struct btf *btf, const char *type_name) for (i = 1; i <= btf->nr_types; i++) { const struct btf_type *t = btf->types[i]; - const char *name = btf_name_by_offset(btf, t->name); + const char *name = btf_name_by_offset(btf, t->name_off); if (name && !strcmp(type_name, name)) return i; -- cgit v1.2.3 From c1c734cb1f54b062f7e67ffc9656d82f5b412b9c Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 23 Mar 2018 10:58:31 -0700 Subject: serial: core: Make sure compiler barfs for 16-byte earlycon names As part of bringup I ended up wanting to call an earlycon driver by a name that was exactly 16 bytes big, specifically "qcom_geni_serial". Unfortunately, when I tried this I found that things compiled just fine. They just didn't work. Specifically the compiler felt perfectly justified in initting the ".name" field of "struct earlycon_id" with the full 16 bytes and just skipping the '\0'. Needless to say, that behavior didn't seem ideal, but I guess someone must have allowed it for a reason. One way to fix this is to shorten the name field to 15 bytes and then add an extra byte after that nobody touches. This should always be initted to 0 and we're golden. There are, of course, other ways to fix this too. We could audit all the users of the "name" field and make them stop at either null termination or 16 bytes. We could also just make the name field much bigger so that we're not likely to run into this. ...but both seem like we'll just hit the bug again. Signed-off-by: Douglas Anderson Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index d224961e1346..d2a2e4bc2435 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -349,7 +349,8 @@ struct earlycon_device { }; struct earlycon_id { - char name[16]; + char name[15]; + char name_term; /* In case compiler didn't '\0' term name */ char compatible[128]; int (*setup)(struct earlycon_device *, const char *options); } __aligned(32); -- cgit v1.2.3 From 05e6557b8ed833546ee2b66ce6b58fecf09f439e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 29 Mar 2018 15:26:48 +1100 Subject: staging: lustre: add container_of_safe() Lustre has a container_of0() function which is similar to container_of() but passes an IS_ERR_OR_NULL() pointer through unchanged. This could be generally useful: bcache at least has a similar function. Naming is hard, but the precedent set by hlist_entry_safe() suggests a _safe suffix might be most consistent. So add container_of_safe() to kernel.h, and replace all occurrences of container_of0() with one of - list_first_entry, list_next_entry, when that is a better fit, - container_of(), when the pointer is used as a valid pointer in surrounding code, - container_of_safe() when there is no obviously better alternative.
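A small usage sketch for the new helper (the types are made up): unlike container_of(), an ERR_PTR() or NULL input falls straight through instead of being offset into a bogus address.

#include <linux/err.h>
#include <linux/kernel.h>

struct inner {
	int x;
};

struct outer {
	int cookie;
	struct inner member;	/* embedded member we cast out from */
};

static struct outer *to_outer(struct inner *p)
{
	/* NULL and ERR_PTR() inputs are returned unchanged. */
	return container_of_safe(p, struct outer, member);
}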
Signed-off-by: NeilBrown Reviewed-by: James Simmons Signed-off-by: Greg Kroah-Hartman --- drivers/staging/lustre/include/linux/libcfs/libcfs.h | 11 ----------- drivers/staging/lustre/lustre/include/cl_object.h | 10 +++++----- drivers/staging/lustre/lustre/include/lu_object.h | 6 +++--- drivers/staging/lustre/lustre/llite/llite_nfs.c | 2 +- drivers/staging/lustre/lustre/llite/vvp_internal.h | 8 ++++---- drivers/staging/lustre/lustre/lmv/lmv_internal.h | 2 +- drivers/staging/lustre/lustre/lov/lov_cl_internal.h | 18 +++++++++--------- drivers/staging/lustre/lustre/lov/lov_internal.h | 2 +- drivers/staging/lustre/lustre/obdclass/lu_object.c | 8 ++++---- drivers/staging/lustre/lustre/obdecho/echo_client.c | 2 +- drivers/staging/lustre/lustre/osc/osc_cl_internal.h | 10 +++++----- drivers/staging/lustre/lustre/osc/osc_internal.h | 2 +- drivers/staging/lustre/lustre/osc/osc_io.c | 2 +- drivers/staging/lustre/lustre/osc/osc_object.c | 2 +- include/linux/kernel.h | 16 ++++++++++++++++ 15 files changed, 53 insertions(+), 48 deletions(-) (limited to 'include') diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs.h b/drivers/staging/lustre/include/linux/libcfs/libcfs.h index 5d10aa32895b..6e7754b2f296 100644 --- a/drivers/staging/lustre/include/linux/libcfs/libcfs.h +++ b/drivers/staging/lustre/include/linux/libcfs/libcfs.h @@ -143,17 +143,6 @@ int libcfs_ioctl_getdata(struct libcfs_ioctl_hdr **hdr_pp, int libcfs_ioctl_data_adjust(struct libcfs_ioctl_data *data); int libcfs_ioctl(unsigned long cmd, void __user *arg); -/* container_of depends on "likely" which is defined in libcfs_private.h */ -static inline void *__container_of(void *ptr, unsigned long shift) -{ - if (IS_ERR_OR_NULL(ptr)) - return ptr; - return (char *)ptr - shift; -} - -#define container_of0(ptr, type, member) \ - ((type *)__container_of((void *)(ptr), offsetof(type, member))) - #define _LIBCFS_H extern struct miscdevice libcfs_dev; diff --git a/drivers/staging/lustre/lustre/include/cl_object.h b/drivers/staging/lustre/lustre/include/cl_object.h index 341a145c3331..6f7b991be809 100644 --- a/drivers/staging/lustre/lustre/include/cl_object.h +++ b/drivers/staging/lustre/lustre/include/cl_object.h @@ -1941,7 +1941,7 @@ static inline int lu_device_is_cl(const struct lu_device *d) static inline struct cl_device *lu2cl_dev(const struct lu_device *d) { LASSERT(!d || IS_ERR(d) || lu_device_is_cl(d)); - return container_of0(d, struct cl_device, cd_lu_dev); + return container_of_safe(d, struct cl_device, cd_lu_dev); } static inline struct lu_device *cl2lu_dev(struct cl_device *d) @@ -1952,13 +1952,13 @@ static inline struct lu_device *cl2lu_dev(struct cl_device *d) static inline struct cl_object *lu2cl(const struct lu_object *o) { LASSERT(!o || IS_ERR(o) || lu_device_is_cl(o->lo_dev)); - return container_of0(o, struct cl_object, co_lu); + return container_of_safe(o, struct cl_object, co_lu); } static inline const struct cl_object_conf * lu2cl_conf(const struct lu_object_conf *conf) { - return container_of0(conf, struct cl_object_conf, coc_lu); + return container_of_safe(conf, struct cl_object_conf, coc_lu); } static inline struct cl_object *cl_object_next(const struct cl_object *obj) @@ -1969,12 +1969,12 @@ static inline struct cl_object *cl_object_next(const struct cl_object *obj) static inline struct cl_device *cl_object_device(const struct cl_object *o) { LASSERT(!o || IS_ERR(o) || lu_device_is_cl(o->co_lu.lo_dev)); - return container_of0(o->co_lu.lo_dev, struct cl_device, cd_lu_dev); + return container_of_safe(o->co_lu.lo_dev, 
struct cl_device, cd_lu_dev); } static inline struct cl_object_header *luh2coh(const struct lu_object_header *h) { - return container_of0(h, struct cl_object_header, coh_lu); + return container_of_safe(h, struct cl_object_header, coh_lu); } static inline struct cl_site *cl_object_site(const struct cl_object *obj) diff --git a/drivers/staging/lustre/lustre/include/lu_object.h b/drivers/staging/lustre/lustre/include/lu_object.h index 35c7b582f36d..c3b0ed518819 100644 --- a/drivers/staging/lustre/lustre/include/lu_object.h +++ b/drivers/staging/lustre/lustre/include/lu_object.h @@ -745,15 +745,15 @@ struct lu_object *lu_object_find_slice(const struct lu_env *env, static inline struct lu_object *lu_object_top(struct lu_object_header *h) { LASSERT(!list_empty(&h->loh_layers)); - return container_of0(h->loh_layers.next, struct lu_object, lo_linkage); + return list_first_entry(&h->loh_layers, struct lu_object, lo_linkage); } /** * Next sub-object in the layering */ -static inline struct lu_object *lu_object_next(const struct lu_object *o) +static inline const struct lu_object *lu_object_next(const struct lu_object *o) { - return container_of0(o->lo_linkage.next, struct lu_object, lo_linkage); + return list_next_entry(o, lo_linkage); } /** diff --git a/drivers/staging/lustre/lustre/llite/llite_nfs.c b/drivers/staging/lustre/lustre/llite/llite_nfs.c index a6a1d80c711a..14172688d55f 100644 --- a/drivers/staging/lustre/lustre/llite/llite_nfs.c +++ b/drivers/staging/lustre/lustre/llite/llite_nfs.c @@ -223,7 +223,7 @@ static int ll_nfs_get_name_filldir(struct dir_context *ctx, const char *name, /* It is hack to access lde_fid for comparison with lgd_fid. * So the input 'name' must be part of the 'lu_dirent'. */ - struct lu_dirent *lde = container_of0(name, struct lu_dirent, lde_name); + struct lu_dirent *lde = container_of((void*)name, struct lu_dirent, lde_name); struct ll_getname_data *lgd = container_of(ctx, struct ll_getname_data, ctx); struct lu_fid fid; diff --git a/drivers/staging/lustre/lustre/llite/vvp_internal.h b/drivers/staging/lustre/lustre/llite/vvp_internal.h index 02ea5161d635..7d3abb43584a 100644 --- a/drivers/staging/lustre/lustre/llite/vvp_internal.h +++ b/drivers/staging/lustre/lustre/llite/vvp_internal.h @@ -263,22 +263,22 @@ static inline struct lu_device *vvp2lu_dev(struct vvp_device *vdv) static inline struct vvp_device *lu2vvp_dev(const struct lu_device *d) { - return container_of0(d, struct vvp_device, vdv_cl.cd_lu_dev); + return container_of_safe(d, struct vvp_device, vdv_cl.cd_lu_dev); } static inline struct vvp_device *cl2vvp_dev(const struct cl_device *d) { - return container_of0(d, struct vvp_device, vdv_cl); + return container_of_safe(d, struct vvp_device, vdv_cl); } static inline struct vvp_object *cl2vvp(const struct cl_object *obj) { - return container_of0(obj, struct vvp_object, vob_cl); + return container_of_safe(obj, struct vvp_object, vob_cl); } static inline struct vvp_object *lu2vvp(const struct lu_object *obj) { - return container_of0(obj, struct vvp_object, vob_cl.co_lu); + return container_of_safe(obj, struct vvp_object, vob_cl.co_lu); } static inline struct inode *vvp_object_inode(const struct cl_object *obj) diff --git a/drivers/staging/lustre/lustre/lmv/lmv_internal.h b/drivers/staging/lustre/lustre/lmv/lmv_internal.h index c27c3c32188d..68a99170c424 100644 --- a/drivers/staging/lustre/lustre/lmv/lmv_internal.h +++ b/drivers/staging/lustre/lustre/lmv/lmv_internal.h @@ -60,7 +60,7 @@ int lmv_revalidate_slaves(struct obd_export *exp, static inline struct 
obd_device *lmv2obd_dev(struct lmv_obd *lmv) { - return container_of0(lmv, struct obd_device, u.lmv); + return container_of_safe(lmv, struct obd_device, u.lmv); } static inline struct lmv_tgt_desc * diff --git a/drivers/staging/lustre/lustre/lov/lov_cl_internal.h b/drivers/staging/lustre/lustre/lov/lov_cl_internal.h index 1185eceaf497..2e9c75ebdda5 100644 --- a/drivers/staging/lustre/lustre/lov/lov_cl_internal.h +++ b/drivers/staging/lustre/lustre/lov/lov_cl_internal.h @@ -496,7 +496,7 @@ static inline struct lu_device *lov2lu_dev(struct lov_device *lov) static inline struct lov_device *lu2lov_dev(const struct lu_device *d) { LINVRNT(d->ld_type == &lov_device_type); - return container_of0(d, struct lov_device, ld_cl.cd_lu_dev); + return container_of(d, struct lov_device, ld_cl.cd_lu_dev); } static inline struct cl_device *lovsub2cl_dev(struct lovsub_device *lovsub) @@ -512,13 +512,13 @@ static inline struct lu_device *lovsub2lu_dev(struct lovsub_device *lovsub) static inline struct lovsub_device *lu2lovsub_dev(const struct lu_device *d) { LINVRNT(d->ld_type == &lovsub_device_type); - return container_of0(d, struct lovsub_device, acid_cl.cd_lu_dev); + return container_of(d, struct lovsub_device, acid_cl.cd_lu_dev); } static inline struct lovsub_device *cl2lovsub_dev(const struct cl_device *d) { LINVRNT(d->cd_lu_dev.ld_type == &lovsub_device_type); - return container_of0(d, struct lovsub_device, acid_cl); + return container_of(d, struct lovsub_device, acid_cl); } static inline struct lu_object *lov2lu(struct lov_object *lov) @@ -534,13 +534,13 @@ static inline struct cl_object *lov2cl(struct lov_object *lov) static inline struct lov_object *lu2lov(const struct lu_object *obj) { LINVRNT(lov_is_object(obj)); - return container_of0(obj, struct lov_object, lo_cl.co_lu); + return container_of(obj, struct lov_object, lo_cl.co_lu); } static inline struct lov_object *cl2lov(const struct cl_object *obj) { LINVRNT(lov_is_object(&obj->co_lu)); - return container_of0(obj, struct lov_object, lo_cl); + return container_of(obj, struct lov_object, lo_cl); } static inline struct lu_object *lovsub2lu(struct lovsub_object *los) @@ -556,13 +556,13 @@ static inline struct cl_object *lovsub2cl(struct lovsub_object *los) static inline struct lovsub_object *cl2lovsub(const struct cl_object *obj) { LINVRNT(lovsub_is_object(&obj->co_lu)); - return container_of0(obj, struct lovsub_object, lso_cl); + return container_of(obj, struct lovsub_object, lso_cl); } static inline struct lovsub_object *lu2lovsub(const struct lu_object *obj) { LINVRNT(lovsub_is_object(obj)); - return container_of0(obj, struct lovsub_object, lso_cl.co_lu); + return container_of(obj, struct lovsub_object, lso_cl.co_lu); } static inline struct lovsub_lock * @@ -590,14 +590,14 @@ static inline struct lov_lock *cl2lov_lock(const struct cl_lock_slice *slice) static inline struct lov_page *cl2lov_page(const struct cl_page_slice *slice) { LINVRNT(lov_is_object(&slice->cpl_obj->co_lu)); - return container_of0(slice, struct lov_page, lps_cl); + return container_of(slice, struct lov_page, lps_cl); } static inline struct lovsub_page * cl2lovsub_page(const struct cl_page_slice *slice) { LINVRNT(lovsub_is_object(&slice->cpl_obj->co_lu)); - return container_of0(slice, struct lovsub_page, lsb_cl); + return container_of(slice, struct lovsub_page, lsb_cl); } static inline struct lov_io *cl2lov_io(const struct lu_env *env, diff --git a/drivers/staging/lustre/lustre/lov/lov_internal.h b/drivers/staging/lustre/lustre/lov/lov_internal.h index 
8dde3828f935..47042f27ca90 100644 --- a/drivers/staging/lustre/lustre/lov/lov_internal.h +++ b/drivers/staging/lustre/lustre/lov/lov_internal.h @@ -280,7 +280,7 @@ static inline bool lov_oinfo_is_dummy(const struct lov_oinfo *loi) static inline struct obd_device *lov2obd(const struct lov_obd *lov) { - return container_of0(lov, struct obd_device, u.lov); + return container_of_safe(lov, struct obd_device, u.lov); } #endif diff --git a/drivers/staging/lustre/lustre/obdclass/lu_object.c b/drivers/staging/lustre/lustre/obdclass/lu_object.c index 3ae16e8501c2..3de7dc0497c4 100644 --- a/drivers/staging/lustre/lustre/obdclass/lu_object.c +++ b/drivers/staging/lustre/lustre/obdclass/lu_object.c @@ -319,7 +319,7 @@ static void lu_object_free(const struct lu_env *env, struct lu_object *o) * lives as long as possible and ->loo_object_free() methods * can look at its contents. */ - o = container_of0(splice.prev, struct lu_object, lo_linkage); + o = container_of(splice.prev, struct lu_object, lo_linkage); list_del_init(&o->lo_linkage); o->lo_ops->loo_object_free(env, o); } @@ -404,8 +404,8 @@ int lu_site_purge_objects(const struct lu_env *env, struct lu_site *s, * races due to the reasons described in lu_object_put(). */ while (!list_empty(&dispose)) { - h = container_of0(dispose.next, - struct lu_object_header, loh_lru); + h = container_of(dispose.next, + struct lu_object_header, loh_lru); list_del_init(&h->loh_lru); lu_object_free(env, lu_object_top(h)); lprocfs_counter_incr(s->ls_stats, LU_SS_LRU_PURGED); @@ -579,7 +579,7 @@ static struct lu_object *htable_lookup(struct lu_site *s, return ERR_PTR(-ENOENT); } - h = container_of0(hnode, struct lu_object_header, loh_hash); + h = container_of(hnode, struct lu_object_header, loh_hash); if (likely(!lu_object_is_dying(h))) { cfs_hash_get(s->ls_obj_hash, hnode); lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_HIT); diff --git a/drivers/staging/lustre/lustre/obdecho/echo_client.c b/drivers/staging/lustre/lustre/obdecho/echo_client.c index 99a76db51ae0..767067b61109 100644 --- a/drivers/staging/lustre/lustre/obdecho/echo_client.c +++ b/drivers/staging/lustre/lustre/obdecho/echo_client.c @@ -99,7 +99,7 @@ static int echo_client_cleanup(struct obd_device *obddev); */ static inline struct echo_device *cl2echo_dev(const struct cl_device *dev) { - return container_of0(dev, struct echo_device, ed_cl); + return container_of_safe(dev, struct echo_device, ed_cl); } static inline struct cl_device *echo_dev2cl(struct echo_device *d) diff --git a/drivers/staging/lustre/lustre/osc/osc_cl_internal.h b/drivers/staging/lustre/lustre/osc/osc_cl_internal.h index 1449013722f6..dc25dd12d7d5 100644 --- a/drivers/staging/lustre/lustre/osc/osc_cl_internal.h +++ b/drivers/staging/lustre/lustre/osc/osc_cl_internal.h @@ -460,7 +460,7 @@ static inline int osc_is_object(const struct lu_object *obj) static inline struct osc_device *lu2osc_dev(const struct lu_device *d) { LINVRNT(d->ld_type == &osc_device_type); - return container_of0(d, struct osc_device, od_cl.cd_lu_dev); + return container_of(d, struct osc_device, od_cl.cd_lu_dev); } static inline struct obd_export *osc_export(const struct osc_object *obj) @@ -476,7 +476,7 @@ static inline struct client_obd *osc_cli(const struct osc_object *obj) static inline struct osc_object *cl2osc(const struct cl_object *obj) { LINVRNT(osc_is_object(&obj->co_lu)); - return container_of0(obj, struct osc_object, oo_cl); + return container_of(obj, struct osc_object, oo_cl); } static inline struct cl_object *osc2cl(const struct osc_object *obj) @@ -509,12 
+509,12 @@ static inline enum cl_lock_mode osc_ldlm2cl_lock(enum ldlm_mode mode) static inline struct osc_page *cl2osc_page(const struct cl_page_slice *slice) { LINVRNT(osc_is_object(&slice->cpl_obj->co_lu)); - return container_of0(slice, struct osc_page, ops_cl); + return container_of(slice, struct osc_page, ops_cl); } static inline struct osc_page *oap2osc(struct osc_async_page *oap) { - return container_of0(oap, struct osc_page, ops_oap); + return container_of_safe(oap, struct osc_page, ops_oap); } static inline pgoff_t osc_index(struct osc_page *opg) @@ -545,7 +545,7 @@ osc_cl_page_osc(struct cl_page *page, struct osc_object *osc) static inline struct osc_lock *cl2osc_lock(const struct cl_lock_slice *slice) { LINVRNT(osc_is_object(&slice->cls_obj->co_lu)); - return container_of0(slice, struct osc_lock, ols_cl); + return container_of(slice, struct osc_lock, ols_cl); } static inline struct osc_lock *osc_lock_at(const struct cl_lock *lock) diff --git a/drivers/staging/lustre/lustre/osc/osc_internal.h b/drivers/staging/lustre/lustre/osc/osc_internal.h index 1c8ba4ad6e7f..fca020568c19 100644 --- a/drivers/staging/lustre/lustre/osc/osc_internal.h +++ b/drivers/staging/lustre/lustre/osc/osc_internal.h @@ -180,7 +180,7 @@ struct osc_device { static inline struct osc_device *obd2osc_dev(const struct obd_device *d) { - return container_of0(d->obd_lu_dev, struct osc_device, od_cl.cd_lu_dev); + return container_of_safe(d->obd_lu_dev, struct osc_device, od_cl.cd_lu_dev); } extern struct lu_kmem_descr osc_caches[]; diff --git a/drivers/staging/lustre/lustre/osc/osc_io.c b/drivers/staging/lustre/lustre/osc/osc_io.c index 76743faf3e6d..67734a8ed331 100644 --- a/drivers/staging/lustre/lustre/osc/osc_io.c +++ b/drivers/staging/lustre/lustre/osc/osc_io.c @@ -55,7 +55,7 @@ static struct osc_io *cl2osc_io(const struct lu_env *env, const struct cl_io_slice *slice) { - struct osc_io *oio = container_of0(slice, struct osc_io, oi_cl); + struct osc_io *oio = container_of_safe(slice, struct osc_io, oi_cl); LINVRNT(oio == osc_env_io(env)); return oio; diff --git a/drivers/staging/lustre/lustre/osc/osc_object.c b/drivers/staging/lustre/lustre/osc/osc_object.c index 4f81dd16f4f5..84240181c7ea 100644 --- a/drivers/staging/lustre/lustre/osc/osc_object.c +++ b/drivers/staging/lustre/lustre/osc/osc_object.c @@ -58,7 +58,7 @@ static struct lu_object *osc2lu(struct osc_object *osc) static struct osc_object *lu2osc(const struct lu_object *obj) { LINVRNT(osc_is_object(obj)); - return container_of0(obj, struct osc_object, oo_cl.co_lu); + return container_of(obj, struct osc_object, oo_cl.co_lu); } /***************************************************************************** diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 6a1eb0b0aad9..58d6645b1425 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -964,6 +964,22 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } "pointer type mismatch in container_of()"); \ ((type *)(__mptr - offsetof(type, member))); }) +/** + * container_of_safe - cast a member of a structure out to the containing structure + * @ptr: the pointer to the member. + * @type: the type of the container struct this is embedded in. + * @member: the name of the member within the struct. + * + * If IS_ERR_OR_NULL(ptr), ptr is returned unchanged. 
+ */ +#define container_of_safe(ptr, type, member) ({ \ + void *__mptr = (void *)(ptr); \ + BUILD_BUG_ON_MSG(!__same_type(*(ptr), ((type *)0)->member) && \ + !__same_type(*(ptr), void), \ + "pointer type mismatch in container_of()"); \ + IS_ERR_OR_NULL(ptr) ? ERR_CAST(ptr) : \ + ((type *)(__mptr - offsetof(type, member))); }) + /* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */ #ifdef CONFIG_FTRACE_MCOUNT_RECORD # define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD -- cgit v1.2.3 From af8bb9f89838249872240f258e67774ccbcc5970 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 17 Apr 2018 10:58:09 -0500 Subject: PCI/ACPI: Request LTR control from platform before using it Per the PCI Firmware spec r3.2, sec 4.5, an ACPI-based OS should use _OSC to request control of Latency Tolerance Reporting (LTR) before using it. Request control of LTR, and if the platform does not grant control, don't use it. N.B. If the hardware supports LTR and the ASPM L1.2 substate but the BIOS doesn't support LTR in _OSC, we previously would enable ASPM L1.2. This patch will prevent us from enabling ASPM L1.2 in that case. It does not prevent us from enabling PCI-PM L1.2, since that doesn't depend on LTR. See PCIe r4.0, sec 5.5.1, for the L1 PM substate entry conditions. Signed-off-by: Bjorn Helgaas Reviewed-by: Rafael J. Wysocki --- drivers/acpi/pci_root.c | 6 ++++++ drivers/pci/probe.c | 5 +++++ include/linux/acpi.h | 3 ++- include/linux/pci.h | 1 + 4 files changed, 14 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index 0da18bde6a16..2ff0d6702a2e 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -153,6 +153,7 @@ static struct pci_osc_bit_struct pci_osc_control_bit[] = { { OSC_PCI_EXPRESS_PME_CONTROL, "PME" }, { OSC_PCI_EXPRESS_AER_CONTROL, "AER" }, { OSC_PCI_EXPRESS_CAPABILITY_CONTROL, "PCIeCapability" }, + { OSC_PCI_EXPRESS_LTR_CONTROL, "LTR" }, }; static void decode_osc_bits(struct acpi_pci_root *root, char *msg, u32 word, @@ -475,6 +476,9 @@ static void negotiate_os_control(struct acpi_pci_root *root, int *no_aspm) | OSC_PCI_EXPRESS_NATIVE_HP_CONTROL | OSC_PCI_EXPRESS_PME_CONTROL; + if (IS_ENABLED(CONFIG_PCIEASPM)) + control |= OSC_PCI_EXPRESS_LTR_CONTROL; + if (pci_aer_available()) { if (aer_acpi_firmware_first()) dev_info(&device->dev, @@ -905,6 +909,8 @@ struct pci_bus *acpi_pci_root_create(struct acpi_pci_root *root, host_bridge->native_aer = 0; if (!(root->osc_control_set & OSC_PCI_EXPRESS_PME_CONTROL)) host_bridge->native_pme = 0; + if (!(root->osc_control_set & OSC_PCI_EXPRESS_LTR_CONTROL)) + host_bridge->native_ltr = 0; pci_scan_child_bus(bus); pci_set_host_bridge_release(host_bridge, acpi_pci_root_release_info, diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index ac91b6fd0bcd..cc1688d75664 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -554,6 +554,7 @@ struct pci_host_bridge *pci_alloc_host_bridge(size_t priv) bridge->native_aer = 1; bridge->native_hotplug = 1; bridge->native_pme = 1; + bridge->native_ltr = 1; return bridge; } @@ -1954,9 +1955,13 @@ static void pci_configure_relaxed_ordering(struct pci_dev *dev) static void pci_configure_ltr(struct pci_dev *dev) { #ifdef CONFIG_PCIEASPM + struct pci_host_bridge *host = pci_find_host_bridge(dev->bus); u32 cap; struct pci_dev *bridge; + if (!host->native_ltr) + return; + if (!pci_is_pcie(dev)) return; diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 15bfb15c2fa5..49f63c67a9d1 100644 --- a/include/linux/acpi.h +++ 
b/include/linux/acpi.h @@ -506,7 +506,8 @@ extern bool osc_pc_lpi_support_confirmed; #define OSC_PCI_EXPRESS_PME_CONTROL 0x00000004 #define OSC_PCI_EXPRESS_AER_CONTROL 0x00000008 #define OSC_PCI_EXPRESS_CAPABILITY_CONTROL 0x00000010 -#define OSC_PCI_CONTROL_MASKS 0x0000001f +#define OSC_PCI_EXPRESS_LTR_CONTROL 0x00000020 +#define OSC_PCI_CONTROL_MASKS 0x0000003f #define ACPI_GSB_ACCESS_ATTRIB_QUICK 0x00000002 #define ACPI_GSB_ACCESS_ATTRIB_SEND_RCV 0x00000004 diff --git a/include/linux/pci.h b/include/linux/pci.h index 73178a2fcee0..d0149c01996d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -473,6 +473,7 @@ struct pci_host_bridge { unsigned int native_aer:1; /* OS may use PCIe AER */ unsigned int native_hotplug:1; /* OS may use PCIe hotplug */ unsigned int native_pme:1; /* OS may use PCIe PME */ + unsigned int native_ltr:1; /* OS may use PCIe LTR */ /* Resource alignment requirements */ resource_size_t (*align_resource)(struct pci_dev *dev, const struct resource *res, -- cgit v1.2.3 From 6163849d289be6ff2acd2fb520da303dec3219f0 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Fri, 20 Apr 2018 23:18:26 +0800 Subject: net: introduce a new tracepoint for tcp_rcv_space_adjust tcp_rcv_space_adjust is called every time data is copied to user space, so introducing a tcp tracepoint for it could show us when the packet is copied to the user. When a tcp packet arrives, tcp_rcv_established() will be called and with the existing tracepoint tcp_probe we could get the time when this packet arrives. Then this packet will be copied to user, and tcp_rcv_space_adjust will be called and with this newly introduced tracepoint we could get the time when this packet is copied to user. With these two tracepoints, we could figure out whether the user program processes this packet immediately or there's latency. Hence in the printk message, sk_cookie is printed as a key to relate tcp_rcv_space_adjust with tcp_probe. Maybe we could export sockfd in this new tracepoint as well, then we could relate this new tracepoint with epoll/read/recv* tracepoints, and finally that could show us the whole lifespan of this packet. But we could also implement that with pid as these functions are executed in process context. Signed-off-by: Yafang Shao Signed-off-by: David S.
From 6163849d289be6ff2acd2fb520da303dec3219f0 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Fri, 20 Apr 2018 23:18:26 +0800 Subject: net: introduce a new tracepoint for tcp_rcv_space_adjust tcp_rcv_space_adjust is called every time data is copied to user space; introduce a tcp tracepoint for it, which can show us when the packet is copied to user. When a tcp packet arrives, tcp_rcv_established() will be called, and with the existing tracepoint tcp_probe we can get the time when this packet arrives. Then this packet will be copied to user, tcp_rcv_space_adjust will be called, and with this newly introduced tracepoint we can get the time when this packet is copied to user. With these two tracepoints, we can figure out whether the user program processes this packet immediately or with some latency. Hence in the printk message, sk_cookie is printed as a key to relate tcp_rcv_space_adjust with tcp_probe. Maybe we could export sockfd in this new tracepoint as well; then we could relate this new tracepoint with epoll/read/recv* tracepoints, and finally that could show us the whole lifespan of this packet. But we could also implement that with pid, as these functions are executed in process context. Signed-off-by: Yafang Shao Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 30 +++++++++++++++++++++--------- net/ipv4/tcp_input.c | 2 ++ 2 files changed, 23 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 3dd68029d77a..c1a5284713b8 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -10,6 +10,7 @@ #include #include #include +#include #define TP_STORE_V4MAPPED(__entry, saddr, daddr) \ do { \ @@ -113,7 +114,7 @@ DEFINE_EVENT(tcp_event_sk_skb, tcp_send_reset, */ DECLARE_EVENT_CLASS(tcp_event_sk, - TP_PROTO(const struct sock *sk), + TP_PROTO(struct sock *sk), TP_ARGS(sk), @@ -125,6 +126,7 @@ DECLARE_EVENT_CLASS(tcp_event_sk, __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) + __field(__u64, sock_cookie) ), TP_fast_assign( @@ -144,24 +146,34 @@ DECLARE_EVENT_CLASS(tcp_event_sk, TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); + + __entry->sock_cookie = sock_gen_cookie(sk); ), - TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c", + TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c sock_cookie=%llx", __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, - __entry->saddr_v6, __entry->daddr_v6) + __entry->saddr_v6, __entry->daddr_v6, + __entry->sock_cookie) ); DEFINE_EVENT(tcp_event_sk, tcp_receive_reset, - TP_PROTO(const struct sock *sk), + TP_PROTO(struct sock *sk), TP_ARGS(sk) ); DEFINE_EVENT(tcp_event_sk, tcp_destroy_sock, - TP_PROTO(const struct sock *sk), + TP_PROTO(struct sock *sk), + + TP_ARGS(sk) +); + +DEFINE_EVENT(tcp_event_sk, tcp_rcv_space_adjust, + + TP_PROTO(struct sock *sk), TP_ARGS(sk) ); @@ -232,6 +244,7 @@ TRACE_EVENT(tcp_probe, __field(__u32, snd_wnd) __field(__u32, srtt) __field(__u32, rcv_wnd) + __field(__u64, sock_cookie) ), TP_fast_assign( @@ -256,15 +269,14 @@ TRACE_EVENT(tcp_probe, __entry->rcv_wnd = tp->rcv_wnd; __entry->ssthresh = tcp_current_ssthresh(sk); __entry->srtt = tp->srtt_us >> 3; + __entry->sock_cookie = sock_gen_cookie(sk); ), - TP_printk("src=%pISpc dest=%pISpc mark=%#x length=%d snd_nxt=%#x " - "snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u " - "rcv_wnd=%u", + TP_printk("src=%pISpc dest=%pISpc mark=%#x length=%d snd_nxt=%#x snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u rcv_wnd=%u sock_cookie=%llx", __entry->saddr, __entry->daddr, __entry->mark, __entry->length, __entry->snd_nxt, __entry->snd_una, __entry->snd_cwnd, __entry->ssthresh, __entry->snd_wnd, - __entry->srtt, __entry->rcv_wnd) + __entry->srtt, __entry->rcv_wnd, __entry->sock_cookie) ); #endif /* _TRACE_TCP_H */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 0396fb919b5d..5a17cfc75326 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -582,6 +582,8 @@ void tcp_rcv_space_adjust(struct sock *sk) u32 copied; int time; + trace_tcp_rcv_space_adjust(sk); + tcp_mstamp_refresh(tp); time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time); if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0) -- cgit v1.2.3
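Since the point of the new tracepoint is to pair each tcp_rcv_space_adjust event with the tcp_probe event of the same socket, here is a small offline sketch of that matching. The event struct and sample values are invented for illustration; in practice both would be parsed from the trace buffer, using the sock_cookie=%llx field the two events now share.

#include <stdint.h>
#include <stdio.h>

struct tcp_trace_event {
	uint64_t sock_cookie;	/* correlation key printed by both events */
	uint64_t ts_ns;		/* trace timestamp */
};

int main(void)
{
	struct tcp_trace_event probe  = { .sock_cookie = 0x2a, .ts_ns = 1000000 };
	struct tcp_trace_event adjust = { .sock_cookie = 0x2a, .ts_ns = 1450000 };

	if (probe.sock_cookie == adjust.sock_cookie)
		printf("arrival to copy-to-user: %llu ns\n",
		       (unsigned long long)(adjust.ts_ns - probe.ts_ns));
	return 0;
}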
From b16fb418b1bf2a9f14d5d2a4fe29bde1f5550b37 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Sat, 21 Apr 2018 09:41:31 -0700 Subject: net: fib_rules: add extack support Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/net/fib_rules.h | 3 ++- net/core/fib_rules.c | 55 +++++++++++++++++++++++++++++++++++++------------ net/decnet/dn_rules.c | 7 +++++-- net/ipv4/fib_rules.c | 7 +++++-- net/ipv4/ipmr.c | 3 ++- net/ipv6/fib6_rules.c | 7 +++++-- net/ipv6/ip6mr.c | 3 ++- 7 files changed, 63 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index e5cfcfc7dd93..b473df5b9512 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -75,7 +75,8 @@ struct fib_rules_ops { int (*configure)(struct fib_rule *, struct sk_buff *, struct fib_rule_hdr *, - struct nlattr **); + struct nlattr **, + struct netlink_ext_ack *); int (*delete)(struct fib_rule *); int (*compare)(struct fib_rule *, struct fib_rule_hdr *, diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 67801104e9a8..ebd9351b213a 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -469,14 +469,18 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, if (frh->src_len) if (!tb[FRA_SRC] || frh->src_len > (ops->addr_size * 8) || - nla_len(tb[FRA_SRC]) != ops->addr_size) + nla_len(tb[FRA_SRC]) != ops->addr_size) { + NL_SET_ERR_MSG(extack, "Invalid source address"); goto errout; + } if (frh->dst_len) if (!tb[FRA_DST] || frh->dst_len > (ops->addr_size * 8) || - nla_len(tb[FRA_DST]) != ops->addr_size) + nla_len(tb[FRA_DST]) != ops->addr_size) { + NL_SET_ERR_MSG(extack, "Invalid dst address"); goto errout; + } nlrule = kzalloc(ops->rule_size, GFP_KERNEL); if (!nlrule) { @@ -537,6 +541,7 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, nlrule->l3mdev = nla_get_u8(tb[FRA_L3MDEV]); if (nlrule->l3mdev != 1) #endif + NL_SET_ERR_MSG(extack, "Invalid l3mdev"); goto errout_free; } @@ -554,31 +559,41 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, nlrule->suppress_ifgroup = -1; if (tb[FRA_GOTO]) { - if (nlrule->action != FR_ACT_GOTO) + if (nlrule->action != FR_ACT_GOTO) { + NL_SET_ERR_MSG(extack, "Unexpected goto"); goto errout_free; + } nlrule->target = nla_get_u32(tb[FRA_GOTO]); /* Backward jumps are prohibited to avoid endless loops */ - if (nlrule->target <= nlrule->pref) + if (nlrule->target <= nlrule->pref) { + NL_SET_ERR_MSG(extack, "Backward goto not supported"); goto errout_free; + } } else if (nlrule->action == FR_ACT_GOTO) { + NL_SET_ERR_MSG(extack, "Missing goto target for action goto"); goto errout_free; } - if (nlrule->l3mdev && nlrule->table) + if (nlrule->l3mdev && nlrule->table) { + NL_SET_ERR_MSG(extack, "l3mdev and table are mutually exclusive"); goto errout_free; + } if (tb[FRA_UID_RANGE]) { if (current_user_ns() != net->user_ns) { err = -EPERM; + NL_SET_ERR_MSG(extack, "No permission to set uid"); goto errout_free; } nlrule->uid_range = nla_get_kuid_range(tb); if (!uid_range_set(&nlrule->uid_range) || - !uid_lte(nlrule->uid_range.start, nlrule->uid_range.end)) + !uid_lte(nlrule->uid_range.start, nlrule->uid_range.end)) { + NL_SET_ERR_MSG(extack, "Invalid uid range"); goto errout_free; + } } else { nlrule->uid_range = fib_kuid_range_unset; } @@ -589,15 +604,19 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[FRA_SPORT_RANGE]) { err = nla_get_port_range(tb[FRA_SPORT_RANGE], &nlrule->sport_range); - if (err) + if (err) { + NL_SET_ERR_MSG(extack, "Invalid sport range"); goto errout_free; + } } if (tb[FRA_DPORT_RANGE]) { err = nla_get_port_range(tb[FRA_DPORT_RANGE], &nlrule->dport_range); - if (err) + if (err) { + NL_SET_ERR_MSG(extack, "Invalid dport range"); goto
errout_free; + } } *rule = nlrule; @@ -621,18 +640,23 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, int err = -EINVAL, unresolved = 0; bool user_priority = false; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { + NL_SET_ERR_MSG(extack, "Invalid msg length"); goto errout; + } ops = lookup_rules_ops(net, frh->family); if (!ops) { err = -EAFNOSUPPORT; + NL_SET_ERR_MSG(extack, "Rule family not supported"); goto errout; } err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack); - if (err < 0) + if (err < 0) { + NL_SET_ERR_MSG(extack, "Error parsing msg"); goto errout; + } err = fib_nl2rule(skb, nlh, extack, ops, tb, &rule, &user_priority); if (err) @@ -644,7 +668,7 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout_free; } - err = ops->configure(rule, skb, frh, tb); + err = ops->configure(rule, skb, frh, tb, extack); if (err < 0) goto errout_free; @@ -723,18 +747,23 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, int err = -EINVAL; bool user_priority = false; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { + NL_SET_ERR_MSG(extack, "Invalid msg length"); goto errout; + } ops = lookup_rules_ops(net, frh->family); if (ops == NULL) { err = -EAFNOSUPPORT; + NL_SET_ERR_MSG(extack, "Rule family not supported"); goto errout; } err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack); - if (err < 0) + if (err < 0) { + NL_SET_ERR_MSG(extack, "Error parsing msg"); goto errout; + } err = fib_nl2rule(skb, nlh, extack, ops, tb, &nlrule, &user_priority); if (err) diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c index c795c3f509c9..72236695db3d 100644 --- a/net/decnet/dn_rules.c +++ b/net/decnet/dn_rules.c @@ -121,13 +121,16 @@ static int dn_fib_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) static int dn_fib_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh, - struct nlattr **tb) + struct nlattr **tb, + struct netlink_ext_ack *extack) { int err = -EINVAL; struct dn_fib_rule *r = (struct dn_fib_rule *)rule; - if (frh->tos) + if (frh->tos) { + NL_SET_ERR_MSG(extack, "Invalid tos value"); goto errout; + } if (rule->table == RT_TABLE_UNSPEC) { if (rule->action == FR_ACT_TO_TBL) { diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 737d11bc8838..f8eb78d042a4 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -213,14 +213,17 @@ static const struct nla_policy fib4_rule_policy[FRA_MAX+1] = { static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh, - struct nlattr **tb) + struct nlattr **tb, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); int err = -EINVAL; struct fib4_rule *rule4 = (struct fib4_rule *) rule; - if (frh->tos & ~IPTOS_TOS_MASK) + if (frh->tos & ~IPTOS_TOS_MASK) { + NL_SET_ERR_MSG(extack, "Invalid tos"); goto errout; + } /* split local/main if they are not already split */ err = fib_unmerge(net); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 2fb4de3f7f66..38e092eafc97 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -201,7 +201,8 @@ static const struct nla_policy ipmr_rule_policy[FRA_MAX + 1] = { }; static int ipmr_rule_configure(struct fib_rule *rule, struct sk_buff *skb, - struct fib_rule_hdr *frh, struct nlattr **tb) + struct fib_rule_hdr *frh, struct nlattr **tb, + struct netlink_ext_ack *extack) { return 0; } diff --git 
a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index df113c7b5fc8..6547fc6491a6 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -245,15 +245,18 @@ static const struct nla_policy fib6_rule_policy[FRA_MAX+1] = { static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh, - struct nlattr **tb) + struct nlattr **tb, + struct netlink_ext_ack *extack) { int err = -EINVAL; struct net *net = sock_net(skb->sk); struct fib6_rule *rule6 = (struct fib6_rule *) rule; if (rule->action == FR_ACT_TO_TBL && !rule->l3mdev) { - if (rule->table == RT6_TABLE_UNSPEC) + if (rule->table == RT6_TABLE_UNSPEC) { + NL_SET_ERR_MSG(extack, "Invalid table"); goto errout; + } if (fib6_new_table(net, rule->table) == NULL) { err = -ENOBUFS; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 298fd8b6ed17..20a419ee8000 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -180,7 +180,8 @@ static const struct nla_policy ip6mr_rule_policy[FRA_MAX + 1] = { }; static int ip6mr_rule_configure(struct fib_rule *rule, struct sk_buff *skb, - struct fib_rule_hdr *frh, struct nlattr **tb) + struct fib_rule_hdr *frh, struct nlattr **tb, + struct netlink_ext_ack *extack) { return 0; } -- cgit v1.2.3 From c6849a3ac17e336811f1d5bba991d2a9bdc47af1 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 22 Apr 2018 21:50:04 +0800 Subject: net: init sk_cookie for inet socket With sk_cookie we can identify a socket, which is very helpful for tracing and statistics, e.g. the tcp tracepoints and eBPF. So we'd better init it by default for inet sockets. When using it, we just need to call atomic64_read(&sk->sk_cookie). Signed-off-by: Yafang Shao Signed-off-by: David S. Miller --- include/linux/sock_diag.h | 9 +++++++++ net/ipv4/tcp_input.c | 8 +++++++- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sock_diag.h b/include/linux/sock_diag.h index 15fe980a27ea..5c916e6dff36 100644 --- a/include/linux/sock_diag.h +++ b/include/linux/sock_diag.h @@ -25,6 +25,15 @@ void sock_diag_unregister(const struct sock_diag_handler *h); void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)); void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)); +static inline +void sock_init_cookie(struct sock *sk) +{ + u64 res; + + res = atomic64_inc_return(&sock_net(sk)->cookie_gen); + atomic64_set(&sk->sk_cookie, res); +} + u64 sock_gen_cookie(struct sock *sk); int sock_diag_check_cookie(struct sock *sk, const __u32 *cookie); void sock_diag_save_cookie(struct sock *sk, __u32 *cookie); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5a17cfc75326..17b78582ba63 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -78,6 +78,7 @@ #include #include #include +#include int sysctl_tcp_max_orphans __read_mostly = NR_FILE; @@ -6190,10 +6191,15 @@ struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, #if IS_ENABLED(CONFIG_IPV6) ireq->pktopts = NULL; #endif - atomic64_set(&ireq->ir_cookie, 0); ireq->ireq_state = TCP_NEW_SYN_RECV; write_pnet(&ireq->ireq_net, sock_net(sk_listener)); ireq->ireq_family = sk_listener->sk_family; + + BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) != + offsetof(struct sock, sk_cookie)); + BUILD_BUG_ON(offsetof(struct inet_request_sock, ireq_net) != + offsetof(struct sock, sk_net)); + sock_init_cookie((struct sock *)ireq); } return req; -- cgit v1.2.3
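A quick userspace model of the sock_init_cookie() helper added above: every new inet socket draws the next value from a per-namespace counter, so a consumer can later read a stable, non-zero identifier via atomic64_read(&sk->sk_cookie). The atomic below stands in for net->cookie_gen; this is a sketch of the allocation scheme, not the kernel implementation.

#include <stdatomic.h>
#include <stdio.h>

static atomic_ullong cookie_gen;	/* stands in for sock_net(sk)->cookie_gen */

/* Models sock_init_cookie(): atomic64_inc_return(), then atomic64_set(). */
static unsigned long long init_cookie_model(void)
{
	return atomic_fetch_add(&cookie_gen, 1) + 1;
}

int main(void)
{
	printf("first socket:  sk_cookie=%llx\n", init_cookie_model());
	printf("second socket: sk_cookie=%llx\n", init_cookie_model());
	return 0;
}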
From 1ac4329a1cff2e0bb12b71c13ad53a0e05bc87a6 Mon Sep 17 00:00:00 2001 From: Denis Bolotin Date: Mon, 23 Apr 2018 14:56:05 +0300 Subject: qed: Add configuration information to register dump and debug data Configuration information is added to the debug data collection, in addition to the register dump. Add qed_dbg_nvm_image(), which receives an image type, allocates a buffer and reads the image. The images are saved in the buffers and the dump size is updated. Signed-off-by: Denis Bolotin Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_debug.c | 113 +++++++++++++++++++++++++++- drivers/net/ethernet/qlogic/qed/qed_mcp.c | 14 +++- drivers/net/ethernet/qlogic/qed/qed_mcp.h | 14 ++++ include/linux/qed/qed_if.h | 3 + 4 files changed, 139 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/qlogic/qed/qed_debug.c b/drivers/net/ethernet/qlogic/qed/qed_debug.c index 4926c5532fba..b3211c7d38c2 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_debug.c +++ b/drivers/net/ethernet/qlogic/qed/qed_debug.c @@ -7778,6 +7778,57 @@ int qed_dbg_igu_fifo_size(struct qed_dev *cdev) return qed_dbg_feature_size(cdev, DBG_FEATURE_IGU_FIFO); } +int qed_dbg_nvm_image_length(struct qed_hwfn *p_hwfn, + enum qed_nvm_images image_id, u32 *length) +{ + struct qed_nvm_image_att image_att; + int rc; + + *length = 0; + rc = qed_mcp_get_nvm_image_att(p_hwfn, image_id, &image_att); + if (rc) + return rc; + + *length = image_att.length; + + return rc; +} + +int qed_dbg_nvm_image(struct qed_dev *cdev, void *buffer, + u32 *num_dumped_bytes, enum qed_nvm_images image_id) +{ + struct qed_hwfn *p_hwfn = + &cdev->hwfns[cdev->dbg_params.engine_for_debug]; + u32 len_rounded, i; + __be32 val; + int rc; + + *num_dumped_bytes = 0; + rc = qed_dbg_nvm_image_length(p_hwfn, image_id, &len_rounded); + if (rc) + return rc; + + DP_NOTICE(p_hwfn->cdev, + "Collecting a debug feature [\"nvram image %d\"]\n", + image_id); + + len_rounded = roundup(len_rounded, sizeof(u32)); + rc = qed_mcp_get_nvm_image(p_hwfn, image_id, buffer, len_rounded); + if (rc) + return rc; + + /* QED_NVM_IMAGE_NVM_META image is not swapped like other images */ + if (image_id != QED_NVM_IMAGE_NVM_META) + for (i = 0; i < len_rounded; i += 4) { + val = cpu_to_be32(*(u32 *)(buffer + i)); + *(u32 *)(buffer + i) = val; + } + + *num_dumped_bytes = len_rounded; + + return rc; +} + int qed_dbg_protection_override(struct qed_dev *cdev, void *buffer, u32 *num_dumped_bytes) { @@ -7831,6 +7882,9 @@ enum debug_print_features { IGU_FIFO = 6, PHY = 7, FW_ASSERTS = 8, + NVM_CFG1 = 9, + DEFAULT_CFG = 10, + NVM_META = 11, }; static u32 qed_calc_regdump_header(enum debug_print_features feature, @@ -7965,13 +8019,61 @@ int qed_dbg_all_data(struct qed_dev *cdev, void *buffer) DP_ERR(cdev, "qed_dbg_mcp_trace failed.
rc = %d\n", rc); } + /* nvm cfg1 */ + rc = qed_dbg_nvm_image(cdev, + (u8 *)buffer + offset + REGDUMP_HEADER_SIZE, + &feature_size, QED_NVM_IMAGE_NVM_CFG1); + if (!rc) { + *(u32 *)((u8 *)buffer + offset) = + qed_calc_regdump_header(NVM_CFG1, cur_engine, + feature_size, omit_engine); + offset += (feature_size + REGDUMP_HEADER_SIZE); + } else if (rc != -ENOENT) { + DP_ERR(cdev, + "qed_dbg_nvm_image failed for image %d (%s), rc = %d\n", + QED_NVM_IMAGE_NVM_CFG1, "QED_NVM_IMAGE_NVM_CFG1", rc); + } + + /* nvm default */ + rc = qed_dbg_nvm_image(cdev, + (u8 *)buffer + offset + REGDUMP_HEADER_SIZE, + &feature_size, QED_NVM_IMAGE_DEFAULT_CFG); + if (!rc) { + *(u32 *)((u8 *)buffer + offset) = + qed_calc_regdump_header(DEFAULT_CFG, cur_engine, + feature_size, omit_engine); + offset += (feature_size + REGDUMP_HEADER_SIZE); + } else if (rc != -ENOENT) { + DP_ERR(cdev, + "qed_dbg_nvm_image failed for image %d (%s), rc = %d\n", + QED_NVM_IMAGE_DEFAULT_CFG, "QED_NVM_IMAGE_DEFAULT_CFG", + rc); + } + + /* nvm meta */ + rc = qed_dbg_nvm_image(cdev, + (u8 *)buffer + offset + REGDUMP_HEADER_SIZE, + &feature_size, QED_NVM_IMAGE_NVM_META); + if (!rc) { + *(u32 *)((u8 *)buffer + offset) = + qed_calc_regdump_header(NVM_META, cur_engine, + feature_size, omit_engine); + offset += (feature_size + REGDUMP_HEADER_SIZE); + } else if (rc != -ENOENT) { + DP_ERR(cdev, + "qed_dbg_nvm_image failed for image %d (%s), rc = %d\n", + QED_NVM_IMAGE_NVM_META, "QED_NVM_IMAGE_NVM_META", rc); + } + return 0; } int qed_dbg_all_data_size(struct qed_dev *cdev) { + struct qed_hwfn *p_hwfn = + &cdev->hwfns[cdev->dbg_params.engine_for_debug]; + u32 regs_len = 0, image_len = 0; u8 cur_engine, org_engine; - u32 regs_len = 0; org_engine = qed_get_debug_engine(cdev); for (cur_engine = 0; cur_engine < cdev->num_hwfns; cur_engine++) { @@ -7993,6 +8095,15 @@ int qed_dbg_all_data_size(struct qed_dev *cdev) /* Engine common */ regs_len += REGDUMP_HEADER_SIZE + qed_dbg_mcp_trace_size(cdev); + qed_dbg_nvm_image_length(p_hwfn, QED_NVM_IMAGE_NVM_CFG1, &image_len); + if (image_len) + regs_len += REGDUMP_HEADER_SIZE + image_len; + qed_dbg_nvm_image_length(p_hwfn, QED_NVM_IMAGE_DEFAULT_CFG, &image_len); + if (image_len) + regs_len += REGDUMP_HEADER_SIZE + image_len; + qed_dbg_nvm_image_length(p_hwfn, QED_NVM_IMAGE_NVM_META, &image_len); + if (image_len) + regs_len += REGDUMP_HEADER_SIZE + image_len; return regs_len; } diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c index 1377ad172fb3..0550f0ee11b3 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c +++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c @@ -2529,7 +2529,7 @@ err0: return rc; } -static int +int qed_mcp_get_nvm_image_att(struct qed_hwfn *p_hwfn, enum qed_nvm_images image_id, struct qed_nvm_image_att *p_image_att) @@ -2545,6 +2545,15 @@ qed_mcp_get_nvm_image_att(struct qed_hwfn *p_hwfn, case QED_NVM_IMAGE_FCOE_CFG: type = NVM_TYPE_FCOE_CFG; break; + case QED_NVM_IMAGE_NVM_CFG1: + type = NVM_TYPE_NVM_CFG1; + break; + case QED_NVM_IMAGE_DEFAULT_CFG: + type = NVM_TYPE_DEFAULT_CFG; + break; + case QED_NVM_IMAGE_NVM_META: + type = NVM_TYPE_META; + break; default: DP_NOTICE(p_hwfn, "Unknown request of image_id %08x\n", image_id); @@ -2588,9 +2597,6 @@ int qed_mcp_get_nvm_image(struct qed_hwfn *p_hwfn, return -EINVAL; } - /* Each NVM image is suffixed by CRC; Upper-layer has no need for it */ - image_att.length -= 4; - if (image_att.length > buffer_len) { DP_VERBOSE(p_hwfn, QED_MSG_STORAGE, diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.h 
b/drivers/net/ethernet/qlogic/qed/qed_mcp.h index dd62c38b3bd3..3af3896420b9 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_mcp.h +++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.h @@ -482,6 +482,20 @@ struct qed_nvm_image_att { u32 length; }; +/** + * @brief Allows getting the attributes of an nvram image + * + * @param p_hwfn + * @param image_id - image to get attributes for + * @param p_image_att - image attributes structure into which to fill data + * + * @return int - 0 - operation was successful. + */ +int +qed_mcp_get_nvm_image_att(struct qed_hwfn *p_hwfn, + enum qed_nvm_images image_id, + struct qed_nvm_image_att *p_image_att); + /** * @brief Allows reading a whole nvram image * diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index b5b2bc9eacca..e53f9c7c2809 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -159,6 +159,9 @@ struct qed_dcbx_get { enum qed_nvm_images { QED_NVM_IMAGE_ISCSI_CFG, QED_NVM_IMAGE_FCOE_CFG, + QED_NVM_IMAGE_NVM_CFG1, + QED_NVM_IMAGE_DEFAULT_CFG, + QED_NVM_IMAGE_NVM_META, }; struct qed_link_eee_params { -- cgit v1.2.3 From 2ff0dab80a8999e99a93fd70f8d701ec3deab207 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 18 Apr 2018 15:18:02 +0200 Subject: mfd: bd9571mwv: Add DDR Backup Power register bit definitions Add definitions for the KEEPON_* bits in the "BKUP Mode Cnt" register, which control the DDR rails to be kept powered when backup mode is enabled. Signed-off-by: Geert Uytterhoeven Acked-by: Lee Jones Signed-off-by: Mark Brown --- include/linux/mfd/bd9571mwv.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/mfd/bd9571mwv.h b/include/linux/mfd/bd9571mwv.h index f0708ba4cbba..eb05569f752b 100644 --- a/include/linux/mfd/bd9571mwv.h +++ b/include/linux/mfd/bd9571mwv.h @@ -33,6 +33,11 @@ #define BD9571MWV_I2C_MD2_E1_BIT_2 0x12 #define BD9571MWV_BKUP_MODE_CNT 0x20 +#define BD9571MWV_BKUP_MODE_CNT_KEEPON_MASK GENMASK(3, 0) +#define BD9571MWV_BKUP_MODE_CNT_KEEPON_DDR0 BIT(0) +#define BD9571MWV_BKUP_MODE_CNT_KEEPON_DDR1 BIT(1) +#define BD9571MWV_BKUP_MODE_CNT_KEEPON_DDR0C BIT(2) +#define BD9571MWV_BKUP_MODE_CNT_KEEPON_DDR1C BIT(3) #define BD9571MWV_BKUP_MODE_STATUS 0x21 #define BD9571MWV_BKUP_RECOVERY_CNT 0x22 #define BD9571MWV_BKUP_CTRL_TIM_CNT 0x23 -- cgit v1.2.3
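As a usage illustration for the new KEEPON definitions, a hypothetical consumer could keep the DDR0 rails powered across backup mode as sketched below (kernel context assumed). Only the register offset and bit masks come from the patch; the function name, its caller-provided regmap handle, and the choice of rails are assumptions.

#include <linux/mfd/bd9571mwv.h>
#include <linux/regmap.h>

/* Keep DDR0/DDR0C powered in backup mode; clear the DDR1 KEEPON bits. */
static int example_keep_ddr0_powered(struct regmap *regmap)
{
	return regmap_update_bits(regmap, BD9571MWV_BKUP_MODE_CNT,
				  BD9571MWV_BKUP_MODE_CNT_KEEPON_MASK,
				  BD9571MWV_BKUP_MODE_CNT_KEEPON_DDR0 |
				  BD9571MWV_BKUP_MODE_CNT_KEEPON_DDR0C);
}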
From a2b2eff6ac2716f499defa590a6ec4ba379d765e Mon Sep 17 00:00:00 2001 From: Leonard Crestez Date: Fri, 13 Apr 2018 14:34:31 -0400 Subject: media: v4l: fwnode: Fix comment incorrectly mentioning v4l2_fwnode_parse_endpoint It's called v4l2_fwnode_endpoint_parse, not v4l2_fwnode_parse_endpoint. Signed-off-by: Leonard Crestez Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- include/media/v4l2-fwnode.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/media/v4l2-fwnode.h b/include/media/v4l2-fwnode.h index c228ec1c77cf..9cccab618b98 100644 --- a/include/media/v4l2-fwnode.h +++ b/include/media/v4l2-fwnode.h @@ -99,7 +99,7 @@ struct v4l2_fwnode_endpoint { struct fwnode_endpoint base; /* * Fields below this line will be zeroed by - * v4l2_fwnode_parse_endpoint() + * v4l2_fwnode_endpoint_parse() */ enum v4l2_mbus_type bus_type; union { -- cgit v1.2.3 From 672e314b21dc614894e69bb56a2b55cc7d256810 Mon Sep 17 00:00:00 2001 From: Matt Atwood Date: Mon, 23 Apr 2018 15:28:03 -0700 Subject: drm/i915/kbl: Add KBL GT2 sku Add a missing GT2 SKU discovered on hardware. Signed-off-by: Matt Atwood Reviewed-by: Clint Taylor Signed-off-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/1524522483-19987-1-git-send-email-matthew.s.atwood@intel.com --- include/drm/i915_pciids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/drm/i915_pciids.h b/include/drm/i915_pciids.h index 70f0c2535b87..bab70ff6e78b 100644 --- a/include/drm/i915_pciids.h +++ b/include/drm/i915_pciids.h @@ -349,6 +349,7 @@ #define INTEL_KBL_GT2_IDS(info) \ INTEL_VGA_DEVICE(0x5916, info), /* ULT GT2 */ \ INTEL_VGA_DEVICE(0x5917, info), /* Mobile GT2 */ \ + INTEL_VGA_DEVICE(0x591C, info), /* ULX GT2 */ \ INTEL_VGA_DEVICE(0x5921, info), /* ULT GT2F */ \ INTEL_VGA_DEVICE(0x591E, info), /* ULX GT2 */ \ INTEL_VGA_DEVICE(0x5912, info), /* DT GT2 */ \ -- cgit v1.2.3 From 7bb3bb4d56d8f3e0b29b8e4a70f2ab7a8e04a935 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Mon, 23 Apr 2018 12:49:58 +0200 Subject: drm/bridge: analogix_dp: Split the platform-specific poweron in two parts Some of the platform-specific stuff in rockchip_dp_poweron() needs to happen before the generic code. Some needs to happen after. Let's split the callback in two. Specifically, we can't start doing PSR work until _after_ the whole controller is up, so don't enable PSR until the end. Cc: Kristian H. Kristensen Signed-off-by: Douglas Anderson [seanpaul added exynos change] Signed-off-by: Sean Paul Signed-off-by: Thierry Escande Reviewed-by: Andrzej Hajda Signed-off-by: Enric Balletbo i Serra Tested-by: Marek Szyprowski Reviewed-by: Archit Taneja Signed-off-by: Andrzej Hajda Link: https://patchwork.freedesktop.org/patch/msgid/20180423105003.9004-23-enric.balletbo@collabora.com --- drivers/gpu/drm/bridge/analogix/analogix_dp_core.c | 7 +++++-- drivers/gpu/drm/exynos/exynos_dp.c | 2 +- drivers/gpu/drm/rockchip/analogix_dp-rockchip.c | 12 ++++++++++-- include/drm/bridge/analogix_dp.h | 3 ++- 4 files changed, 18 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/bridge/analogix/analogix_dp_core.c b/drivers/gpu/drm/bridge/analogix/analogix_dp_core.c index a260de4f0bd8..2bcbfadb6ac5 100644 --- a/drivers/gpu/drm/bridge/analogix/analogix_dp_core.c +++ b/drivers/gpu/drm/bridge/analogix/analogix_dp_core.c @@ -1260,8 +1260,8 @@ static int analogix_dp_set_bridge(struct analogix_dp_device *dp) goto out_dp_clk_pre; } - if (dp->plat_data->power_on) - dp->plat_data->power_on(dp->plat_data); + if (dp->plat_data->power_on_start) + dp->plat_data->power_on_start(dp->plat_data); phy_power_on(dp->phy); @@ -1286,6 +1286,9 @@ static int analogix_dp_set_bridge(struct analogix_dp_device *dp) goto out_dp_init; } + if (dp->plat_data->power_on_end) + dp->plat_data->power_on_end(dp->plat_data); + enable_irq(dp->irq); return 0; diff --git a/drivers/gpu/drm/exynos/exynos_dp.c b/drivers/gpu/drm/exynos/exynos_dp.c index 964831dab102..86330f396784 100644 --- a/drivers/gpu/drm/exynos/exynos_dp.c +++ b/drivers/gpu/drm/exynos/exynos_dp.c @@ -162,7 +162,7 @@ static int exynos_dp_bind(struct device *dev, struct device *master, void *data) dp->drm_dev = drm_dev; dp->plat_data.dev_type = EXYNOS_DP; - dp->plat_data.power_on = exynos_dp_poweron; + dp->plat_data.power_on_start = exynos_dp_poweron; dp->plat_data.power_off = exynos_dp_poweroff; dp->plat_data.attach = exynos_dp_bridge_attach; dp->plat_data.get_modes = exynos_dp_get_modes; diff --git a/drivers/gpu/drm/rockchip/analogix_dp-rockchip.c b/drivers/gpu/drm/rockchip/analogix_dp-rockchip.c index b3f46ed24cdc..23317a2269e1 100644 ---
a/drivers/gpu/drm/rockchip/analogix_dp-rockchip.c +++ b/drivers/gpu/drm/rockchip/analogix_dp-rockchip.c @@ -109,7 +109,7 @@ static int rockchip_dp_pre_init(struct rockchip_dp_device *dp) return 0; } -static int rockchip_dp_poweron(struct analogix_dp_plat_data *plat_data) +static int rockchip_dp_poweron_start(struct analogix_dp_plat_data *plat_data) { struct rockchip_dp_device *dp = to_dp(plat_data); int ret; @@ -127,6 +127,13 @@ static int rockchip_dp_poweron(struct analogix_dp_plat_data *plat_data) return ret; } + return ret; +} + +static int rockchip_dp_poweron_end(struct analogix_dp_plat_data *plat_data) +{ + struct rockchip_dp_device *dp = to_dp(plat_data); + return rockchip_drm_psr_activate(&dp->encoder); } @@ -330,7 +337,8 @@ static int rockchip_dp_bind(struct device *dev, struct device *master, dp->plat_data.encoder = &dp->encoder; dp->plat_data.dev_type = dp->data->chip_type; - dp->plat_data.power_on = rockchip_dp_poweron; + dp->plat_data.power_on_start = rockchip_dp_poweron_start; + dp->plat_data.power_on_end = rockchip_dp_poweron_end; dp->plat_data.power_off = rockchip_dp_powerdown; dp->plat_data.get_modes = rockchip_dp_get_modes; diff --git a/include/drm/bridge/analogix_dp.h b/include/drm/bridge/analogix_dp.h index e9a1116d2f8e..475b706b49de 100644 --- a/include/drm/bridge/analogix_dp.h +++ b/include/drm/bridge/analogix_dp.h @@ -33,7 +33,8 @@ struct analogix_dp_plat_data { struct drm_connector *connector; bool skip_connector; - int (*power_on)(struct analogix_dp_plat_data *); + int (*power_on_start)(struct analogix_dp_plat_data *); + int (*power_on_end)(struct analogix_dp_plat_data *); int (*power_off)(struct analogix_dp_plat_data *); int (*attach)(struct analogix_dp_plat_data *, struct drm_bridge *, struct drm_connector *); -- cgit v1.2.3 From 9a31fa395c19d5873190bf84c8192f5799861342 Mon Sep 17 00:00:00 2001 From: Takeshi Kihara Date: Fri, 20 Apr 2018 21:27:43 +0900 Subject: clk: renesas: Add r8a77990 CPG Core Clock Definitions This patch adds all R-Car E3 Clock Pulse Generator Core Clock Outputs. Note that internal CPG clocks (S0, S1, S2, S3, SDSRC, POST3) are not included, as they are used as internal clock sources only, and never referenced from DT. Signed-off-by: Takeshi Kihara [shimoda: add SPDX-License-Identifier] Signed-off-by: Yoshihiro Shimoda Signed-off-by: Geert Uytterhoeven --- include/dt-bindings/clock/r8a77990-cpg-mssr.h | 62 +++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 include/dt-bindings/clock/r8a77990-cpg-mssr.h (limited to 'include') diff --git a/include/dt-bindings/clock/r8a77990-cpg-mssr.h b/include/dt-bindings/clock/r8a77990-cpg-mssr.h new file mode 100644 index 000000000000..a596a482f3a9 --- /dev/null +++ b/include/dt-bindings/clock/r8a77990-cpg-mssr.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2018 Renesas Electronics Corp. 
+ */ +#ifndef __DT_BINDINGS_CLOCK_R8A77990_CPG_MSSR_H__ +#define __DT_BINDINGS_CLOCK_R8A77990_CPG_MSSR_H__ + +#include + +/* r8a77990 CPG Core Clocks */ +#define R8A77990_CLK_Z2 0 +#define R8A77990_CLK_ZR 1 +#define R8A77990_CLK_ZG 2 +#define R8A77990_CLK_ZTR 3 +#define R8A77990_CLK_ZT 4 +#define R8A77990_CLK_ZX 5 +#define R8A77990_CLK_S0D1 6 +#define R8A77990_CLK_S0D3 7 +#define R8A77990_CLK_S0D6 8 +#define R8A77990_CLK_S0D12 9 +#define R8A77990_CLK_S0D24 10 +#define R8A77990_CLK_S1D1 11 +#define R8A77990_CLK_S1D2 12 +#define R8A77990_CLK_S1D4 13 +#define R8A77990_CLK_S2D1 14 +#define R8A77990_CLK_S2D2 15 +#define R8A77990_CLK_S2D4 16 +#define R8A77990_CLK_S3D1 17 +#define R8A77990_CLK_S3D2 18 +#define R8A77990_CLK_S3D4 19 +#define R8A77990_CLK_S0D6C 20 +#define R8A77990_CLK_S3D1C 21 +#define R8A77990_CLK_S3D2C 22 +#define R8A77990_CLK_S3D4C 23 +#define R8A77990_CLK_LB 24 +#define R8A77990_CLK_CL 25 +#define R8A77990_CLK_ZB3 26 +#define R8A77990_CLK_ZB3D2 27 +#define R8A77990_CLK_CR 28 +#define R8A77990_CLK_CRD2 29 +#define R8A77990_CLK_SD0H 30 +#define R8A77990_CLK_SD0 31 +#define R8A77990_CLK_SD1H 32 +#define R8A77990_CLK_SD1 33 +#define R8A77990_CLK_SD3H 34 +#define R8A77990_CLK_SD3 35 +#define R8A77990_CLK_RPC 36 +#define R8A77990_CLK_RPCD2 37 +#define R8A77990_CLK_ZA2 38 +#define R8A77990_CLK_ZA8 39 +#define R8A77990_CLK_Z2D 40 +#define R8A77990_CLK_CANFD 41 +#define R8A77990_CLK_MSO 42 +#define R8A77990_CLK_R 43 +#define R8A77990_CLK_OSC 44 +#define R8A77990_CLK_LV0 45 +#define R8A77990_CLK_LV1 46 +#define R8A77990_CLK_CSI0 47 +#define R8A77990_CLK_CP 48 +#define R8A77990_CLK_CPEX 49 + +#endif /* __DT_BINDINGS_CLOCK_R8A77990_CPG_MSSR_H__ */ -- cgit v1.2.3 From a268de77faf6881756b4943b287fd78ec05a7d1e Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 26 Feb 2018 10:15:17 +0100 Subject: netfilter: nf_flow_table: move init code to nf_flow_table_core.c Reduces duplication of .gc and .params in flowtable type definitions and makes the API clearer Signed-off-by: Felix Fietkau Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 6 +- net/ipv4/netfilter/nf_flow_table_ipv4.c | 3 +- net/ipv6/netfilter/nf_flow_table_ipv6.c | 3 +- net/netfilter/nf_flow_table_core.c | 102 +++++++++++++++++++------------- net/netfilter/nf_flow_table_inet.c | 3 +- net/netfilter/nf_tables_api.c | 22 +++---- 6 files changed, 74 insertions(+), 65 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 76ee5c81b752..f876e32a60b8 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -14,9 +14,8 @@ struct nf_flowtable; struct nf_flowtable_type { struct list_head list; int family; - void (*gc)(struct work_struct *work); + int (*init)(struct nf_flowtable *ft); void (*free)(struct nf_flowtable *ft); - const struct rhashtable_params *params; nf_hookfn *hook; struct module *owner; }; @@ -100,9 +99,8 @@ int nf_flow_table_iterate(struct nf_flowtable *flow_table, void nf_flow_table_cleanup(struct net *net, struct net_device *dev); +int nf_flow_table_init(struct nf_flowtable *flow_table); void nf_flow_table_free(struct nf_flowtable *flow_table); -void nf_flow_offload_work_gc(struct work_struct *work); -extern const struct rhashtable_params nf_flow_offload_rhash_params; void flow_offload_dead(struct flow_offload *flow); diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c index b6e43ff0c7b7..e1e56d7123d2 100644 --- 
a/net/ipv4/netfilter/nf_flow_table_ipv4.c +++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c @@ -7,8 +7,7 @@ static struct nf_flowtable_type flowtable_ipv4 = { .family = NFPROTO_IPV4, - .params = &nf_flow_offload_rhash_params, - .gc = nf_flow_offload_work_gc, + .init = nf_flow_table_init, .free = nf_flow_table_free, .hook = nf_flow_offload_ip_hook, .owner = THIS_MODULE, diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c index f1804ce8d561..c511d206bf9b 100644 --- a/net/ipv6/netfilter/nf_flow_table_ipv6.c +++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c @@ -8,8 +8,7 @@ static struct nf_flowtable_type flowtable_ipv6 = { .family = NFPROTO_IPV6, - .params = &nf_flow_offload_rhash_params, - .gc = nf_flow_offload_work_gc, + .init = nf_flow_table_init, .free = nf_flow_table_free, .hook = nf_flow_offload_ipv6_hook, .owner = THIS_MODULE, diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 7403a0dfddf7..09d1be669c39 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -116,16 +116,50 @@ void flow_offload_dead(struct flow_offload *flow) } EXPORT_SYMBOL_GPL(flow_offload_dead); +static u32 flow_offload_hash(const void *data, u32 len, u32 seed) +{ + const struct flow_offload_tuple *tuple = data; + + return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed); +} + +static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed) +{ + const struct flow_offload_tuple_rhash *tuplehash = data; + + return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed); +} + +static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg, + const void *ptr) +{ + const struct flow_offload_tuple *tuple = arg->key; + const struct flow_offload_tuple_rhash *x = ptr; + + if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir))) + return 1; + + return 0; +} + +static const struct rhashtable_params nf_flow_offload_rhash_params = { + .head_offset = offsetof(struct flow_offload_tuple_rhash, node), + .hashfn = flow_offload_hash, + .obj_hashfn = flow_offload_hash_obj, + .obj_cmpfn = flow_offload_hash_cmp, + .automatic_shrinking = true, +}; + int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) { flow->timeout = (u32)jiffies; rhashtable_insert_fast(&flow_table->rhashtable, &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node, - *flow_table->type->params); + nf_flow_offload_rhash_params); rhashtable_insert_fast(&flow_table->rhashtable, &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node, - *flow_table->type->params); + nf_flow_offload_rhash_params); return 0; } EXPORT_SYMBOL_GPL(flow_offload_add); @@ -135,10 +169,10 @@ static void flow_offload_del(struct nf_flowtable *flow_table, { rhashtable_remove_fast(&flow_table->rhashtable, &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node, - *flow_table->type->params); + nf_flow_offload_rhash_params); rhashtable_remove_fast(&flow_table->rhashtable, &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node, - *flow_table->type->params); + nf_flow_offload_rhash_params); flow_offload_free(flow); } @@ -148,7 +182,7 @@ flow_offload_lookup(struct nf_flowtable *flow_table, struct flow_offload_tuple *tuple) { return rhashtable_lookup_fast(&flow_table->rhashtable, tuple, - *flow_table->type->params); + nf_flow_offload_rhash_params); } EXPORT_SYMBOL_GPL(flow_offload_lookup); @@ -237,7 +271,7 @@ out: return 1; } -void nf_flow_offload_work_gc(struct work_struct *work) +static void nf_flow_offload_work_gc(struct work_struct *work) { struct 
nf_flowtable *flow_table; @@ -245,42 +279,6 @@ void nf_flow_offload_work_gc(struct work_struct *work) nf_flow_offload_gc_step(flow_table); queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ); } -EXPORT_SYMBOL_GPL(nf_flow_offload_work_gc); - -static u32 flow_offload_hash(const void *data, u32 len, u32 seed) -{ - const struct flow_offload_tuple *tuple = data; - - return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed); -} - -static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed) -{ - const struct flow_offload_tuple_rhash *tuplehash = data; - - return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed); -} - -static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg, - const void *ptr) -{ - const struct flow_offload_tuple *tuple = arg->key; - const struct flow_offload_tuple_rhash *x = ptr; - - if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir))) - return 1; - - return 0; -} - -const struct rhashtable_params nf_flow_offload_rhash_params = { - .head_offset = offsetof(struct flow_offload_tuple_rhash, node), - .hashfn = flow_offload_hash, - .obj_hashfn = flow_offload_hash_obj, - .obj_cmpfn = flow_offload_hash_cmp, - .automatic_shrinking = true, -}; -EXPORT_SYMBOL_GPL(nf_flow_offload_rhash_params); static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff, __be16 port, __be16 new_port) @@ -398,6 +396,24 @@ int nf_flow_dnat_port(const struct flow_offload *flow, } EXPORT_SYMBOL_GPL(nf_flow_dnat_port); +int nf_flow_table_init(struct nf_flowtable *flowtable) +{ + int err; + + INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc); + + err = rhashtable_init(&flowtable->rhashtable, + &nf_flow_offload_rhash_params); + if (err < 0) + return err; + + queue_delayed_work(system_power_efficient_wq, + &flowtable->gc_work, HZ); + + return 0; +} +EXPORT_SYMBOL_GPL(nf_flow_table_init); + static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data) { struct net_device *dev = data; @@ -423,8 +439,10 @@ EXPORT_SYMBOL_GPL(nf_flow_table_cleanup); void nf_flow_table_free(struct nf_flowtable *flow_table) { + cancel_delayed_work_sync(&flow_table->gc_work); nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL); WARN_ON(!nf_flow_offload_gc_step(flow_table)); + rhashtable_destroy(&flow_table->rhashtable); } EXPORT_SYMBOL_GPL(nf_flow_table_free); diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c index 375a1881d93d..99771aa7e7ea 100644 --- a/net/netfilter/nf_flow_table_inet.c +++ b/net/netfilter/nf_flow_table_inet.c @@ -22,8 +22,7 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb, static struct nf_flowtable_type flowtable_inet = { .family = NFPROTO_INET, - .params = &nf_flow_offload_rhash_params, - .gc = nf_flow_offload_work_gc, + .init = nf_flow_table_init, .free = nf_flow_table_free, .hook = nf_flow_offload_inet_hook, .owner = THIS_MODULE, diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 9134cc429ad4..6cd9955916e5 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5150,14 +5150,14 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, } flowtable->data.type = type; - err = rhashtable_init(&flowtable->data.rhashtable, type->params); + err = type->init(&flowtable->data); if (err < 0) goto err3; err = nf_tables_flowtable_parse_hook(&ctx, nla[NFTA_FLOWTABLE_HOOK], flowtable); if (err < 0) - goto err3; + goto err4; for (i = 0; i < flowtable->ops_len; i++) 
{ if (!flowtable->ops[i].dev) @@ -5171,37 +5171,35 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, if (flowtable->ops[i].dev == ft->ops[k].dev && flowtable->ops[i].pf == ft->ops[k].pf) { err = -EBUSY; - goto err4; + goto err5; } } } err = nf_register_net_hook(net, &flowtable->ops[i]); if (err < 0) - goto err4; + goto err5; } err = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable); if (err < 0) - goto err5; - - INIT_DEFERRABLE_WORK(&flowtable->data.gc_work, type->gc); - queue_delayed_work(system_power_efficient_wq, - &flowtable->data.gc_work, HZ); + goto err6; list_add_tail_rcu(&flowtable->list, &table->flowtables); table->use++; return 0; -err5: +err6: i = flowtable->ops_len; -err4: +err5: for (k = i - 1; k >= 0; k--) { kfree(flowtable->dev_name[k]); nf_unregister_net_hook(net, &flowtable->ops[k]); } kfree(flowtable->ops); +err4: + flowtable->data.type->free(&flowtable->data); err3: module_put(type->owner); err2: @@ -5485,11 +5483,9 @@ err: static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) { - cancel_delayed_work_sync(&flowtable->data.gc_work); kfree(flowtable->ops); kfree(flowtable->name); flowtable->data.type->free(&flowtable->data); - rhashtable_destroy(&flowtable->data.rhashtable); module_put(flowtable->data.type->owner); } -- cgit v1.2.3 From 84453a90252ca0cd7d1bd229199a40c58bfe431e Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 26 Feb 2018 10:15:19 +0100 Subject: netfilter: nf_flow_table: track flow tables in nf_flow_table directly Avoids having nf_flow_table depend on nftables (useful for future iptables backport work) Signed-off-by: Felix Fietkau Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 1 + include/net/netfilter/nf_tables.h | 3 --- net/netfilter/nf_flow_table_core.c | 21 ++++++++++++++++++--- net/netfilter/nf_tables_api.c | 17 ----------------- 4 files changed, 19 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index f876e32a60b8..ab408adba688 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -21,6 +21,7 @@ struct nf_flowtable_type { }; struct nf_flowtable { + struct list_head list; struct rhashtable rhashtable; const struct nf_flowtable_type *type; struct delayed_work gc_work; diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index cd368d1b8cb8..2f2062ae1c45 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1109,9 +1109,6 @@ struct nft_flowtable { struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table, const struct nlattr *nla, u8 genmask); -void nft_flow_table_iterate(struct net *net, - void (*iter)(struct nf_flowtable *flowtable, void *data), - void *data); void nft_register_flowtable_type(struct nf_flowtable_type *type); void nft_unregister_flowtable_type(struct nf_flowtable_type *type); diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 09d1be669c39..e761359b56a9 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -18,6 +18,9 @@ struct flow_offload_entry { struct rcu_head rcu_head; }; +static DEFINE_MUTEX(flowtable_lock); +static LIST_HEAD(flowtables); + static void flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct, struct nf_flow_route *route, @@ -410,6 +413,10 @@ int nf_flow_table_init(struct nf_flowtable *flowtable) 
queue_delayed_work(system_power_efficient_wq, &flowtable->gc_work, HZ); + mutex_lock(&flowtable_lock); + list_add(&flowtable->list, &flowtables); + mutex_unlock(&flowtable_lock); + return 0; } EXPORT_SYMBOL_GPL(nf_flow_table_init); @@ -425,20 +432,28 @@ static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data) } static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable, - void *data) + struct net_device *dev) { - nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, data); + nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev); flush_delayed_work(&flowtable->gc_work); } void nf_flow_table_cleanup(struct net *net, struct net_device *dev) { - nft_flow_table_iterate(net, nf_flow_table_iterate_cleanup, dev); + struct nf_flowtable *flowtable; + + mutex_lock(&flowtable_lock); + list_for_each_entry(flowtable, &flowtables, list) + nf_flow_table_iterate_cleanup(flowtable, dev); + mutex_unlock(&flowtable_lock); } EXPORT_SYMBOL_GPL(nf_flow_table_cleanup); void nf_flow_table_free(struct nf_flowtable *flow_table) { + mutex_lock(&flowtable_lock); + list_del(&flow_table->list); + mutex_unlock(&flowtable_lock); cancel_delayed_work_sync(&flow_table->gc_work); nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL); WARN_ON(!nf_flow_offload_gc_step(flow_table)); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 517bb93c00fb..16b67f54b3d2 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5060,23 +5060,6 @@ static const struct nf_flowtable_type *nft_flowtable_type_get(u8 family) return ERR_PTR(-ENOENT); } -void nft_flow_table_iterate(struct net *net, - void (*iter)(struct nf_flowtable *flowtable, void *data), - void *data) -{ - struct nft_flowtable *flowtable; - const struct nft_table *table; - - nfnl_lock(NFNL_SUBSYS_NFTABLES); - list_for_each_entry(table, &net->nft.tables, list) { - list_for_each_entry(flowtable, &table->flowtables, list) { - iter(&flowtable->data, data); - } - } - nfnl_unlock(NFNL_SUBSYS_NFTABLES); -} -EXPORT_SYMBOL_GPL(nft_flow_table_iterate); - static void nft_unregister_flowtable_net_hooks(struct net *net, struct nft_flowtable *flowtable) { -- cgit v1.2.3 From 6bdc3c68d94c5d6adc675ee55361962e9dd2489d Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 26 Feb 2018 10:15:20 +0100 Subject: netfilter: nf_flow_table: make flow_offload_dead inline It is too trivial to keep as a separate exported function Signed-off-by: Felix Fietkau Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 5 ++++- net/netfilter/nf_flow_table_core.c | 6 ------ 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index ab408adba688..5aa49524ebef 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -103,7 +103,10 @@ void nf_flow_table_cleanup(struct net *net, struct net_device *dev); int nf_flow_table_init(struct nf_flowtable *flow_table); void nf_flow_table_free(struct nf_flowtable *flow_table); -void flow_offload_dead(struct flow_offload *flow); +static inline void flow_offload_dead(struct flow_offload *flow) +{ + flow->flags |= FLOW_OFFLOAD_DYING; +} int nf_flow_snat_port(const struct flow_offload *flow, struct sk_buff *skb, unsigned int thoff, diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index e761359b56a9..0d38f20fd226 100644 --- a/net/netfilter/nf_flow_table_core.c +++ 
b/net/netfilter/nf_flow_table_core.c @@ -113,12 +113,6 @@ void flow_offload_free(struct flow_offload *flow) } EXPORT_SYMBOL_GPL(flow_offload_free); -void flow_offload_dead(struct flow_offload *flow) -{ - flow->flags |= FLOW_OFFLOAD_DYING; -} -EXPORT_SYMBOL_GPL(flow_offload_dead); - static u32 flow_offload_hash(const void *data, u32 len, u32 seed) { const struct flow_offload_tuple *tuple = data; -- cgit v1.2.3 From 59c466dd68e796f3a7a0709d90c72ce2d84e29c2 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 26 Feb 2018 10:15:21 +0100 Subject: netfilter: nf_flow_table: add a new flow state for tearing down offloading On cleanup, this will be treated differently from FLOW_OFFLOAD_DYING: If FLOW_OFFLOAD_DYING is set, the connection is going away, so both the offload state and the connection tracking entry will be deleted. If FLOW_OFFLOAD_TEARDOWN is set, the connection remains alive, but the offload state is torn down. This is useful for cases that require more complex state tracking / timeout handling on TCP, or if the connection has been idle for too long. Support for sending flows back to the slow path will be implemented in a following patch Signed-off-by: Felix Fietkau Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 2 ++ net/netfilter/nf_flow_table_core.c | 22 ++++++++++++++-------- 2 files changed, 16 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 5aa49524ebef..ba9fa4592f2b 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -68,6 +68,7 @@ struct flow_offload_tuple_rhash { #define FLOW_OFFLOAD_SNAT 0x1 #define FLOW_OFFLOAD_DNAT 0x2 #define FLOW_OFFLOAD_DYING 0x4 +#define FLOW_OFFLOAD_TEARDOWN 0x8 struct flow_offload { struct flow_offload_tuple_rhash tuplehash[FLOW_OFFLOAD_DIR_MAX]; @@ -103,6 +104,7 @@ void nf_flow_table_cleanup(struct net *net, struct net_device *dev); int nf_flow_table_init(struct nf_flowtable *flow_table); void nf_flow_table_free(struct nf_flowtable *flow_table); +void flow_offload_teardown(struct flow_offload *flow); static inline void flow_offload_dead(struct flow_offload *flow) { flow->flags |= FLOW_OFFLOAD_DYING; diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 0d38f20fd226..5a81e4f771e9 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -174,6 +174,12 @@ static void flow_offload_del(struct nf_flowtable *flow_table, flow_offload_free(flow); } +void flow_offload_teardown(struct flow_offload *flow) +{ + flow->flags |= FLOW_OFFLOAD_TEARDOWN; +} +EXPORT_SYMBOL_GPL(flow_offload_teardown); + struct flow_offload_tuple_rhash * flow_offload_lookup(struct nf_flowtable *flow_table, struct flow_offload_tuple *tuple) @@ -226,11 +232,6 @@ static inline bool nf_flow_has_expired(const struct flow_offload *flow) return (__s32)(flow->timeout - (u32)jiffies) <= 0; } -static inline bool nf_flow_is_dying(const struct flow_offload *flow) -{ - return flow->flags & FLOW_OFFLOAD_DYING; -} - static int nf_flow_offload_gc_step(struct nf_flowtable *flow_table) { struct flow_offload_tuple_rhash *tuplehash; @@ -258,7 +259,8 @@ static int nf_flow_offload_gc_step(struct nf_flowtable *flow_table) flow = container_of(tuplehash, struct flow_offload, tuplehash[0]); if (nf_flow_has_expired(flow) || - nf_flow_is_dying(flow)) + (flow->flags & (FLOW_OFFLOAD_DYING | + FLOW_OFFLOAD_TEARDOWN))) flow_offload_del(flow_table, flow); } out: @@ -419,10 
+421,14 @@ static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data) { struct net_device *dev = data; - if (dev && flow->tuplehash[0].tuple.iifidx != dev->ifindex) + if (!dev) { + flow_offload_teardown(flow); return; + } - flow_offload_dead(flow); + if (flow->tuplehash[0].tuple.iifidx == dev->ifindex || + flow->tuplehash[1].tuple.iifidx == dev->ifindex) + flow_offload_dead(flow); } static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable, -- cgit v1.2.3
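To make the new state split concrete, a hedged sketch of how callers are expected to pick a flag (kernel context assumed; both helper functions here are hypothetical): teardown keeps the conntrack entry alive and merely unoffloads, while dying removes the offload state together with the dying connection.

/* TCP grew state too complex to keep offloaded (or idled too long):
 * hand the flow back to the slow path; the connection stays alive. */
static void example_unoffload_flow(struct flow_offload *flow)
{
	flow_offload_teardown(flow);
}

/* The connection itself is going away: GC will delete the offload
 * state and the conntrack entry. */
static void example_kill_flow(struct flow_offload *flow)
{
	flow_offload_dead(flow);
}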
From cac20fcdf146b82d02e412d7a345f5826279cd82 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 28 Mar 2018 12:06:51 +0200 Subject: netfilter: nf_tables: simplify lookup functions Replace the nf_tables_ prefix by nft_ and merge code into a single lookup function whenever possible. In many cases the nf_tables_-prefixed function names went over the 80-character boundary; this change saves us ~50 LoC. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 12 +- net/netfilter/nf_tables_api.c | 249 +++++++++++++++----------------- net/netfilter/nft_flow_offload.c | 5 +- net/netfilter/nft_objref.c | 4 +- 4 files changed, 110 insertions(+), 160 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 2f2062ae1c45..123e82a2f8bb 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1015,9 +1015,9 @@ static inline void *nft_obj_data(const struct nft_object *obj) #define nft_expr_obj(expr) *((struct nft_object **)nft_expr_priv(expr)) -struct nft_object *nf_tables_obj_lookup(const struct nft_table *table, - const struct nlattr *nla, u32 objtype, - u8 genmask); +struct nft_object *nft_obj_lookup(const struct nft_table *table, + const struct nlattr *nla, u32 objtype, + u8 genmask); void nft_obj_notify(struct net *net, struct nft_table *table, struct nft_object *obj, u32 portid, u32 seq, @@ -1106,9 +1106,9 @@ struct nft_flowtable { struct nf_flowtable data; }; -struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table, - const struct nlattr *nla, - u8 genmask); +struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table, + const struct nlattr *nla, + u8 genmask); void nft_register_flowtable_type(struct nf_flowtable_type *type); void nft_unregister_flowtable_type(struct nf_flowtable_type *type); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 16b67f54b3d2..f65e650b61aa 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -386,13 +386,17 @@ static struct nft_table *nft_table_lookup(const struct net *net, { struct nft_table *table; + if (nla == NULL) + return ERR_PTR(-EINVAL); + list_for_each_entry(table, &net->nft.tables, list) { if (!nla_strcmp(nla, table->name) && table->family == family && nft_active_genmask(table, genmask)) return table; } - return NULL; + + return ERR_PTR(-ENOENT); } static struct nft_table *nft_table_lookup_byhandle(const struct net *net, @@ -406,37 +410,6 @@ static struct nft_table *nft_table_lookup_byhandle(const struct net *net, nft_active_genmask(table, genmask)) return table; } - return NULL; -} - -static struct nft_table *nf_tables_table_lookup(const struct net *net, - const struct nlattr *nla, - u8 family, u8 genmask) -{ - struct nft_table *table; - - if (nla == NULL) - return ERR_PTR(-EINVAL); - - table = nft_table_lookup(net, nla, family, genmask); - if (table != NULL) - return table; - - return ERR_PTR(-ENOENT); -} - -static struct nft_table *nf_tables_table_lookup_byhandle(const struct net *net, - const struct nlattr *nla, - u8 genmask) -{ - struct nft_table *table; - - if (nla == NULL) - return ERR_PTR(-EINVAL); - - table = nft_table_lookup_byhandle(net, nla, genmask); - if (table != NULL) - return table; return ERR_PTR(-ENOENT); } @@ -608,8 +581,7 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk, return netlink_dump_start(nlsk, skb, nlh, &c); } - table = nf_tables_table_lookup(net, nla[NFTA_TABLE_NAME], family, - genmask); + table = nft_table_lookup(net, nla[NFTA_TABLE_NAME], family, genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -735,7 +707,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, int err; name = nla[NFTA_TABLE_NAME]; - table = nf_tables_table_lookup(net, name, family, genmask); + table = nft_table_lookup(net, name, family, genmask); if (IS_ERR(table)) { if (PTR_ERR(table) != -ENOENT) return PTR_ERR(table); @@ -893,12 +865,11 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk, return nft_flush(&ctx, family); if (nla[NFTA_TABLE_HANDLE]) - table = nf_tables_table_lookup_byhandle(net, - nla[NFTA_TABLE_HANDLE], - genmask); + table = nft_table_lookup_byhandle(net, nla[NFTA_TABLE_HANDLE], + genmask); else - table = nf_tables_table_lookup(net, nla[NFTA_TABLE_NAME], - family, genmask); + table = nft_table_lookup(net, nla[NFTA_TABLE_NAME], family, + genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -949,8 +920,7 @@ EXPORT_SYMBOL_GPL(nft_unregister_chain_type); */ static struct nft_chain * -nf_tables_chain_lookup_byhandle(const struct nft_table *table, u64 handle, - u8 genmask) +nft_chain_lookup_byhandle(const struct nft_table *table, u64 handle, u8 genmask) { struct nft_chain *chain; @@ -963,9 +933,8 @@ nf_tables_chain_lookup_byhandle(const struct nft_table *table, u64 handle, return ERR_PTR(-ENOENT); } -static struct nft_chain *nf_tables_chain_lookup(const struct nft_table *table, - const struct nlattr *nla, - u8 genmask) +static struct nft_chain *nft_chain_lookup(const struct nft_table *table, + const struct nlattr *nla, u8 genmask) { struct nft_chain *chain; @@ -1194,12 +1163,11 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk, return netlink_dump_start(nlsk, skb, nlh, &c); } - table = nf_tables_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, - genmask); + table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask); if (IS_ERR(table)) return PTR_ERR(table); - chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); + chain = nft_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); if (IS_ERR(chain)) return PTR_ERR(chain); @@ -1513,8 +1481,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, nla[NFTA_CHAIN_NAME]) { struct nft_chain *chain2; - chain2 = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], - genmask); + chain2 = nft_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); if (!IS_ERR(chain2)) return -EEXIST; } @@ -1576,8 +1543,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, create = nlh->nlmsg_flags & NLM_F_CREATE ?
true : false; - table = nf_tables_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, - genmask); + table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -1586,11 +1552,11 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, if (nla[NFTA_CHAIN_HANDLE]) { handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE])); - chain = nf_tables_chain_lookup_byhandle(table, handle, genmask); + chain = nft_chain_lookup_byhandle(table, handle, genmask); if (IS_ERR(chain)) return PTR_ERR(chain); } else { - chain = nf_tables_chain_lookup(table, name, genmask); + chain = nft_chain_lookup(table, name, genmask); if (IS_ERR(chain)) { if (PTR_ERR(chain) != -ENOENT) return PTR_ERR(chain); @@ -1647,16 +1613,15 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk, u32 use; int err; - table = nf_tables_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, - genmask); + table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask); if (IS_ERR(table)) return PTR_ERR(table); if (nla[NFTA_CHAIN_HANDLE]) { handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE])); - chain = nf_tables_chain_lookup_byhandle(table, handle, genmask); + chain = nft_chain_lookup_byhandle(table, handle, genmask); } else { - chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); + chain = nft_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); } if (IS_ERR(chain)) return PTR_ERR(chain); @@ -1939,8 +1904,8 @@ void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr) * Rules */ -static struct nft_rule *__nf_tables_rule_lookup(const struct nft_chain *chain, - u64 handle) +static struct nft_rule *__nft_rule_lookup(const struct nft_chain *chain, + u64 handle) { struct nft_rule *rule; @@ -1953,13 +1918,13 @@ static struct nft_rule *__nf_tables_rule_lookup(const struct nft_chain *chain, return ERR_PTR(-ENOENT); } -static struct nft_rule *nf_tables_rule_lookup(const struct nft_chain *chain, - const struct nlattr *nla) +static struct nft_rule *nft_rule_lookup(const struct nft_chain *chain, + const struct nlattr *nla) { if (nla == NULL) return ERR_PTR(-EINVAL); - return __nf_tables_rule_lookup(chain, be64_to_cpu(nla_get_be64(nla))); + return __nft_rule_lookup(chain, be64_to_cpu(nla_get_be64(nla))); } static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = { @@ -2191,16 +2156,15 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, return netlink_dump_start(nlsk, skb, nlh, &c); } - table = nf_tables_table_lookup(net, nla[NFTA_RULE_TABLE], family, - genmask); + table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask); if (IS_ERR(table)) return PTR_ERR(table); - chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask); + chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask); if (IS_ERR(chain)) return PTR_ERR(chain); - rule = nf_tables_rule_lookup(chain, nla[NFTA_RULE_HANDLE]); + rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]); if (IS_ERR(rule)) return PTR_ERR(rule); @@ -2265,18 +2229,17 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, create = nlh->nlmsg_flags & NLM_F_CREATE ? 
true : false; - table = nf_tables_table_lookup(net, nla[NFTA_RULE_TABLE], family, - genmask); + table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask); if (IS_ERR(table)) return PTR_ERR(table); - chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask); + chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask); if (IS_ERR(chain)) return PTR_ERR(chain); if (nla[NFTA_RULE_HANDLE]) { handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_HANDLE])); - rule = __nf_tables_rule_lookup(chain, handle); + rule = __nft_rule_lookup(chain, handle); if (IS_ERR(rule)) return PTR_ERR(rule); @@ -2300,7 +2263,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, return -EOPNOTSUPP; pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION])); - old_rule = __nf_tables_rule_lookup(chain, pos_handle); + old_rule = __nft_rule_lookup(chain, pos_handle); if (IS_ERR(old_rule)) return PTR_ERR(old_rule); } @@ -2435,14 +2398,12 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, int family = nfmsg->nfgen_family, err = 0; struct nft_ctx ctx; - table = nf_tables_table_lookup(net, nla[NFTA_RULE_TABLE], family, - genmask); + table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask); if (IS_ERR(table)) return PTR_ERR(table); if (nla[NFTA_RULE_CHAIN]) { - chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN], - genmask); + chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask); if (IS_ERR(chain)) return PTR_ERR(chain); } @@ -2451,8 +2412,7 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, if (chain) { if (nla[NFTA_RULE_HANDLE]) { - rule = nf_tables_rule_lookup(chain, - nla[NFTA_RULE_HANDLE]); + rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]); if (IS_ERR(rule)) return PTR_ERR(rule); @@ -2635,8 +2595,8 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net, struct nft_table *table = NULL; if (nla[NFTA_SET_TABLE] != NULL) { - table = nf_tables_table_lookup(net, nla[NFTA_SET_TABLE], - family, genmask); + table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, + genmask); if (IS_ERR(table)) return PTR_ERR(table); } @@ -2645,8 +2605,8 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net, return 0; } -static struct nft_set *nf_tables_set_lookup(const struct nft_table *table, - const struct nlattr *nla, u8 genmask) +static struct nft_set *nft_set_lookup(const struct nft_table *table, + const struct nlattr *nla, u8 genmask) { struct nft_set *set; @@ -2661,14 +2621,12 @@ static struct nft_set *nf_tables_set_lookup(const struct nft_table *table, return ERR_PTR(-ENOENT); } -static struct nft_set *nf_tables_set_lookup_byhandle(const struct nft_table *table, - const struct nlattr *nla, u8 genmask) +static struct nft_set *nft_set_lookup_byhandle(const struct nft_table *table, + const struct nlattr *nla, + u8 genmask) { struct nft_set *set; - if (nla == NULL) - return ERR_PTR(-EINVAL); - list_for_each_entry(set, &table->sets, list) { if (be64_to_cpu(nla_get_be64(nla)) == set->handle && nft_active_genmask(set, genmask)) @@ -2677,9 +2635,8 @@ static struct nft_set *nf_tables_set_lookup_byhandle(const struct nft_table *tab return ERR_PTR(-ENOENT); } -static struct nft_set *nf_tables_set_lookup_byid(const struct net *net, - const struct nlattr *nla, - u8 genmask) +static struct nft_set *nft_set_lookup_byid(const struct net *net, + const struct nlattr *nla, u8 genmask) { struct nft_trans *trans; u32 id = ntohl(nla_get_be32(nla)); @@ -2703,12 +2660,12 @@ struct nft_set 
*nft_set_lookup_global(const struct net *net, { struct nft_set *set; - set = nf_tables_set_lookup(table, nla_set_name, genmask); + set = nft_set_lookup(table, nla_set_name, genmask); if (IS_ERR(set)) { if (!nla_set_id) return set; - set = nf_tables_set_lookup_byid(net, nla_set_id, genmask); + set = nft_set_lookup_byid(net, nla_set_id, genmask); } return set; } @@ -2980,7 +2937,7 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk, if (!nla[NFTA_SET_TABLE]) return -EINVAL; - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask); + set = nft_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) return PTR_ERR(set); @@ -3132,14 +3089,13 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false; - table = nf_tables_table_lookup(net, nla[NFTA_SET_TABLE], family, - genmask); + table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, genmask); if (IS_ERR(table)) return PTR_ERR(table); nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); - set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME], genmask); + set = nft_set_lookup(table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) { if (PTR_ERR(set) != -ENOENT) return PTR_ERR(set); @@ -3262,9 +3218,10 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk, return err; if (nla[NFTA_SET_HANDLE]) - set = nf_tables_set_lookup_byhandle(ctx.table, nla[NFTA_SET_HANDLE], genmask); + set = nft_set_lookup_byhandle(ctx.table, nla[NFTA_SET_HANDLE], + genmask); else - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask); + set = nft_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) return PTR_ERR(set); @@ -3404,8 +3361,8 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net, int family = nfmsg->nfgen_family; struct nft_table *table; - table = nf_tables_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], - family, genmask); + table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family, + genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -3741,8 +3698,7 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk, if (err < 0) return err; - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], - genmask); + set = nft_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], genmask); if (IS_ERR(set)) return PTR_ERR(set); @@ -3954,8 +3910,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, err = -EINVAL; goto err2; } - obj = nf_tables_obj_lookup(ctx->table, nla[NFTA_SET_ELEM_OBJREF], - set->objtype, genmask); + obj = nft_obj_lookup(ctx->table, nla[NFTA_SET_ELEM_OBJREF], + set->objtype, genmask); if (IS_ERR(obj)) { err = PTR_ERR(obj); goto err2; @@ -4284,8 +4240,7 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk, if (err < 0) return err; - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], - genmask); + set = nft_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], genmask); if (IS_ERR(set)) return PTR_ERR(set); if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) @@ -4373,9 +4328,9 @@ void nft_unregister_obj(struct nft_object_type *obj_type) } EXPORT_SYMBOL_GPL(nft_unregister_obj); -struct nft_object *nf_tables_obj_lookup(const struct nft_table *table, - const struct nlattr *nla, - u32 objtype, u8 genmask) +struct nft_object *nft_obj_lookup(const struct nft_table *table, + const struct nlattr *nla, u32 objtype, + u8 genmask) { struct nft_object *obj; @@ -4387,11 +4342,11 @@ struct 
nft_object *nf_tables_obj_lookup(const struct nft_table *table, } return ERR_PTR(-ENOENT); } -EXPORT_SYMBOL_GPL(nf_tables_obj_lookup); +EXPORT_SYMBOL_GPL(nft_obj_lookup); -static struct nft_object *nf_tables_obj_lookup_byhandle(const struct nft_table *table, - const struct nlattr *nla, - u32 objtype, u8 genmask) +static struct nft_object *nft_obj_lookup_byhandle(const struct nft_table *table, + const struct nlattr *nla, + u32 objtype, u8 genmask) { struct nft_object *obj; @@ -4535,13 +4490,12 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, !nla[NFTA_OBJ_DATA]) return -EINVAL; - table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], family, - genmask); + table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask); if (IS_ERR(table)) return PTR_ERR(table); objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); - obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask); + obj = nft_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask); if (IS_ERR(obj)) { err = PTR_ERR(obj); if (err != -ENOENT) @@ -4761,13 +4715,12 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk, !nla[NFTA_OBJ_TYPE]) return -EINVAL; - table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], family, - genmask); + table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask); if (IS_ERR(table)) return PTR_ERR(table); objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); - obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask); + obj = nft_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask); if (IS_ERR(obj)) return PTR_ERR(obj); @@ -4817,18 +4770,17 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk, (!nla[NFTA_OBJ_NAME] && !nla[NFTA_OBJ_HANDLE])) return -EINVAL; - table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], family, - genmask); + table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask); if (IS_ERR(table)) return PTR_ERR(table); objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); if (nla[NFTA_OBJ_HANDLE]) - obj = nf_tables_obj_lookup_byhandle(table, nla[NFTA_OBJ_HANDLE], - objtype, genmask); + obj = nft_obj_lookup_byhandle(table, nla[NFTA_OBJ_HANDLE], + objtype, genmask); else - obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], - objtype, genmask); + obj = nft_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, + genmask); if (IS_ERR(obj)) return PTR_ERR(obj); if (obj->use > 0) @@ -4903,9 +4855,8 @@ static const struct nla_policy nft_flowtable_policy[NFTA_FLOWTABLE_MAX + 1] = { [NFTA_FLOWTABLE_HANDLE] = { .type = NLA_U64 }, }; -struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table, - const struct nlattr *nla, - u8 genmask) +struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table, + const struct nlattr *nla, u8 genmask) { struct nft_flowtable *flowtable; @@ -4916,11 +4867,11 @@ struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table, } return ERR_PTR(-ENOENT); } -EXPORT_SYMBOL_GPL(nf_tables_flowtable_lookup); +EXPORT_SYMBOL_GPL(nft_flowtable_lookup); static struct nft_flowtable * -nf_tables_flowtable_lookup_byhandle(const struct nft_table *table, - const struct nlattr *nla, u8 genmask) +nft_flowtable_lookup_byhandle(const struct nft_table *table, + const struct nlattr *nla, u8 genmask) { struct nft_flowtable *flowtable; @@ -5093,13 +5044,13 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, !nla[NFTA_FLOWTABLE_HOOK]) return -EINVAL; - table = nf_tables_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], - family, genmask); + table = 
nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family, + genmask); if (IS_ERR(table)) return PTR_ERR(table); - flowtable = nf_tables_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME], - genmask); + flowtable = nft_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME], + genmask); if (IS_ERR(flowtable)) { err = PTR_ERR(flowtable); if (err != -ENOENT) @@ -5210,19 +5161,19 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk, !nla[NFTA_FLOWTABLE_HANDLE])) return -EINVAL; - table = nf_tables_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], - family, genmask); + table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family, + genmask); if (IS_ERR(table)) return PTR_ERR(table); if (nla[NFTA_FLOWTABLE_HANDLE]) - flowtable = nf_tables_flowtable_lookup_byhandle(table, - nla[NFTA_FLOWTABLE_HANDLE], - genmask); + flowtable = nft_flowtable_lookup_byhandle(table, + nla[NFTA_FLOWTABLE_HANDLE], + genmask); else - flowtable = nf_tables_flowtable_lookup(table, - nla[NFTA_FLOWTABLE_NAME], - genmask); + flowtable = nft_flowtable_lookup(table, + nla[NFTA_FLOWTABLE_NAME], + genmask); if (IS_ERR(flowtable)) return PTR_ERR(flowtable); if (flowtable->use > 0) @@ -5407,13 +5358,13 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk, if (!nla[NFTA_FLOWTABLE_NAME]) return -EINVAL; - table = nf_tables_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], - family, genmask); + table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family, + genmask); if (IS_ERR(table)) return PTR_ERR(table); - flowtable = nf_tables_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME], - genmask); + flowtable = nft_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME], + genmask); if (IS_ERR(flowtable)) return PTR_ERR(flowtable); @@ -6382,8 +6333,8 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, case NFT_GOTO: if (!tb[NFTA_VERDICT_CHAIN]) return -EINVAL; - chain = nf_tables_chain_lookup(ctx->table, - tb[NFTA_VERDICT_CHAIN], genmask); + chain = nft_chain_lookup(ctx->table, tb[NFTA_VERDICT_CHAIN], + genmask); if (IS_ERR(chain)) return PTR_ERR(chain); if (nft_is_base_chain(chain)) diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index b65829b2be22..d6bab8c3cbb0 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -142,9 +142,8 @@ static int nft_flow_offload_init(const struct nft_ctx *ctx, if (!tb[NFTA_FLOW_TABLE_NAME]) return -EINVAL; - flowtable = nf_tables_flowtable_lookup(ctx->table, - tb[NFTA_FLOW_TABLE_NAME], - genmask); + flowtable = nft_flowtable_lookup(ctx->table, tb[NFTA_FLOW_TABLE_NAME], + genmask); if (IS_ERR(flowtable)) return PTR_ERR(flowtable); diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index 0b02407773ad..cdf348f751ec 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -38,8 +38,8 @@ static int nft_objref_init(const struct nft_ctx *ctx, return -EINVAL; objtype = ntohl(nla_get_be32(tb[NFTA_OBJREF_IMM_TYPE])); - obj = nf_tables_obj_lookup(ctx->table, tb[NFTA_OBJREF_IMM_NAME], objtype, - genmask); + obj = nft_obj_lookup(ctx->table, tb[NFTA_OBJREF_IMM_NAME], objtype, + genmask); if (IS_ERR(obj)) return -ENOENT; -- cgit v1.2.3 From 71cc0873e0e0a4c6dca899c42e3ac143f7960d8e Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Tue, 3 Apr 2018 23:15:39 +0200 Subject: netfilter: nf_tables: Simplify set backend selection Drop nft_set_type's ability to act as a container of multiple backend implementations it chooses from. Instead, consolidate the whole selection logic in nft_select_set_ops() and the backend-provided estimate() callback. This turns nf_tables_set_types into a list containing all available backends, which is traversed when selecting one that matches the criteria requested by userspace. This change also allows embedding the nft_set_ops structure into nft_set_type and pulling the flags field into the latter, as it is only used during the selection phase. A crucial part of this change is to make sure the new layout respects the hash backend constraints formerly enforced by the nft_hash_select_ops() function: this is achieved by introducing a specific estimate() callback for nft_hash_fast_ops which returns false for key lengths != 4. In turn, nft_hash_estimate() is changed to return false for key lengths == 4 so it won't be chosen by accident. Also, both callbacks must return false for unbounded sets, as their size estimate depends on a known maximum element count. Note that this patch partially reverts commit 4f2921ca21b71 ("netfilter: nf_tables: meter: pick a set backend that supports updates") by making nft_set_ops_candidate() not explicitly look for an update callback but instead making NFT_SET_EVAL a regular backend feature flag which is checked along with the others. This way all feature requirements are checked in one go. Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso
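
As a rough standalone illustration of the selection flow described above: the names below (set_type, select_type, the FEAT_* flags, desc) are simplified stand-ins invented for this sketch, not the kernel's types; only the shape of the algorithm -- feature-mask check first, then the type's own estimate() callback accepting or refusing the description, then keeping the candidate with the best estimate -- is meant to mirror what nft_select_set_ops() does after this patch.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define FEAT_MAP     (1u << 0)
#define FEAT_TIMEOUT (1u << 1)
#define FEAT_EVAL    (1u << 2)	/* treated as an ordinary feature flag */
#define FEAT_ALL     (FEAT_MAP | FEAT_TIMEOUT | FEAT_EVAL)

struct desc { unsigned int size, klen; };
struct estimate { unsigned int space; };

struct set_type {
	const char *name;
	unsigned int features;
	/* returns false when this backend cannot serve the description */
	bool (*estimate)(const struct desc *d, struct estimate *est);
};

/* "fast" hash: only bounded sets with 4-byte keys */
static bool hash_fast_estimate(const struct desc *d, struct estimate *est)
{
	if (!d->size || d->klen != 4)
		return false;
	est->space = d->size;
	return true;
}

/* generic hash: refuses 4-byte keys so the fast variant wins them */
static bool hash_estimate(const struct desc *d, struct estimate *est)
{
	if (!d->size || d->klen == 4)
		return false;
	est->space = d->size * 2;
	return true;
}

static const struct set_type types[] = {
	{ "hash_fast", FEAT_MAP, hash_fast_estimate },
	{ "hash",      FEAT_MAP, hash_estimate },
};

static const struct set_type *select_type(const struct desc *d,
					  unsigned int flags)
{
	struct estimate est, best = { .space = ~0u };
	const struct set_type *chosen = NULL;
	size_t i;

	for (i = 0; i < sizeof(types) / sizeof(types[0]); i++) {
		/* every requested feature must be offered by the type */
		if ((flags & types[i].features) != (flags & FEAT_ALL))
			continue;
		if (!types[i].estimate(d, &est))
			continue;
		if (est.space < best.space) {
			chosen = &types[i];
			best = est;
		}
	}
	return chosen;
}

int main(void)
{
	struct desc d = { .size = 64, .klen = 4 };
	const struct set_type *t = select_type(&d, FEAT_MAP);

	printf("selected: %s\n", t ? t->name : "(none)");	/* hash_fast */
	return 0;
}

With a bounded 4-byte key only the fast variant passes its estimate(); set desc.size to zero and neither candidate qualifies, matching the unbounded-set constraint noted in the message.
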
--- include/net/netfilter/nf_tables.h | 34 ++++----- net/netfilter/nf_tables_api.c | 25 +++---- net/netfilter/nft_set_bitmap.c | 34 ++++----- net/netfilter/nft_set_hash.c | 153 +++++++++++++++++++++----------- net/netfilter/nft_set_rbtree.c | 36 ++++----- 5 files changed, 139 insertions(+), 143 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 123e82a2f8bb..de77d36e36b3 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -275,23 +275,6 @@ struct nft_set_estimate { enum nft_set_class space; }; -/** - * struct nft_set_type - nf_tables set type - * - * @select_ops: function to select nft_set_ops - * @ops: default ops, used when no select_ops functions is present - * @list: used internally - * @owner: module reference - */ -struct nft_set_type { - const struct nft_set_ops *(*select_ops)(const struct nft_ctx *, - const struct nft_set_desc *desc, - u32 flags); - const struct nft_set_ops *ops; - struct list_head list; - struct module *owner; -}; - struct nft_set_ext; struct nft_expr; @@ -310,7 +293,6 @@ struct nft_expr; * @init: initialize private data of new set instance * @destroy: destroy private data of set instance * @elemsize: element private size - * @features: features supported by the implementation */ struct nft_set_ops { bool (*lookup)(const struct net *net, @@ -361,9 +343,23 @@ struct nft_set_ops { void (*destroy)(const struct nft_set *set); unsigned int elemsize; +}; + +/** + * struct nft_set_type - nf_tables set type + * + * @ops: set ops for this type + * @list: used internally + * @owner: module reference + * @features: features supported by the implementation + */ +struct nft_set_type { + const struct nft_set_ops ops; + struct list_head list; + struct module *owner; u32 features; - const struct nft_set_type *type; }; +#define to_set_type(o) container_of(o, struct nft_set_type, ops) int nft_register_set(struct nft_set_type *type); void nft_unregister_set(struct nft_set_type *type); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 2f14cadd9922..9ce35acf491d 100644 ---
a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2523,14 +2523,12 @@ void nft_unregister_set(struct nft_set_type *type) EXPORT_SYMBOL_GPL(nft_unregister_set); #define NFT_SET_FEATURES (NFT_SET_INTERVAL | NFT_SET_MAP | \ - NFT_SET_TIMEOUT | NFT_SET_OBJECT) + NFT_SET_TIMEOUT | NFT_SET_OBJECT | \ + NFT_SET_EVAL) -static bool nft_set_ops_candidate(const struct nft_set_ops *ops, u32 flags) +static bool nft_set_ops_candidate(const struct nft_set_type *type, u32 flags) { - if ((flags & NFT_SET_EVAL) && !ops->update) - return false; - - return (flags & ops->features) == (flags & NFT_SET_FEATURES); + return (flags & type->features) == (flags & NFT_SET_FEATURES); } /* @@ -2567,14 +2565,9 @@ nft_select_set_ops(const struct nft_ctx *ctx, best.space = ~0; list_for_each_entry(type, &nf_tables_set_types, list) { - if (!type->select_ops) - ops = type->ops; - else - ops = type->select_ops(ctx, desc, flags); - if (!ops) - continue; + ops = &type->ops; - if (!nft_set_ops_candidate(ops, flags)) + if (!nft_set_ops_candidate(type, flags)) continue; if (!ops->estimate(desc, flags, &est)) continue; @@ -2605,7 +2598,7 @@ nft_select_set_ops(const struct nft_ctx *ctx, if (!try_module_get(type->owner)) continue; if (bops != NULL) - module_put(bops->type->owner); + module_put(to_set_type(bops)->owner); bops = ops; best = est; @@ -3247,14 +3240,14 @@ err3: err2: kvfree(set); err1: - module_put(ops->type->owner); + module_put(to_set_type(ops)->owner); return err; } static void nft_set_destroy(struct nft_set *set) { set->ops->destroy(set); - module_put(set->ops->type->owner); + module_put(to_set_type(set->ops)->owner); kfree(set->name); kvfree(set); } diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 45fb2752fb63..d6626e01c7ee 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -296,27 +296,23 @@ static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features, return true; } -static struct nft_set_type nft_bitmap_type; -static struct nft_set_ops nft_bitmap_ops __read_mostly = { - .type = &nft_bitmap_type, - .privsize = nft_bitmap_privsize, - .elemsize = offsetof(struct nft_bitmap_elem, ext), - .estimate = nft_bitmap_estimate, - .init = nft_bitmap_init, - .destroy = nft_bitmap_destroy, - .insert = nft_bitmap_insert, - .remove = nft_bitmap_remove, - .deactivate = nft_bitmap_deactivate, - .flush = nft_bitmap_flush, - .activate = nft_bitmap_activate, - .lookup = nft_bitmap_lookup, - .walk = nft_bitmap_walk, - .get = nft_bitmap_get, -}; - static struct nft_set_type nft_bitmap_type __read_mostly = { - .ops = &nft_bitmap_ops, .owner = THIS_MODULE, + .ops = { + .privsize = nft_bitmap_privsize, + .elemsize = offsetof(struct nft_bitmap_elem, ext), + .estimate = nft_bitmap_estimate, + .init = nft_bitmap_init, + .destroy = nft_bitmap_destroy, + .insert = nft_bitmap_insert, + .remove = nft_bitmap_remove, + .deactivate = nft_bitmap_deactivate, + .flush = nft_bitmap_flush, + .activate = nft_bitmap_activate, + .lookup = nft_bitmap_lookup, + .walk = nft_bitmap_walk, + .get = nft_bitmap_get, + }, }; static int __init nft_bitmap_module_init(void) diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index fc9c6d5d64cd..dbf1f4ad077c 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -605,6 +605,12 @@ static void nft_hash_destroy(const struct nft_set *set) static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features, struct nft_set_estimate *est) { + if (!desc->size) + return 
false; + + if (desc->klen == 4) + return false; + est->size = sizeof(struct nft_hash) + nft_hash_buckets(desc->size) * sizeof(struct hlist_head) + desc->size * sizeof(struct nft_hash_elem); @@ -614,91 +620,100 @@ static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features, return true; } -static struct nft_set_type nft_hash_type; -static struct nft_set_ops nft_rhash_ops __read_mostly = { - .type = &nft_hash_type, - .privsize = nft_rhash_privsize, - .elemsize = offsetof(struct nft_rhash_elem, ext), - .estimate = nft_rhash_estimate, - .init = nft_rhash_init, - .destroy = nft_rhash_destroy, - .insert = nft_rhash_insert, - .activate = nft_rhash_activate, - .deactivate = nft_rhash_deactivate, - .flush = nft_rhash_flush, - .remove = nft_rhash_remove, - .lookup = nft_rhash_lookup, - .update = nft_rhash_update, - .walk = nft_rhash_walk, - .get = nft_rhash_get, - .features = NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT, -}; +static bool nft_hash_fast_estimate(const struct nft_set_desc *desc, u32 features, + struct nft_set_estimate *est) +{ + if (!desc->size) + return false; -static struct nft_set_ops nft_hash_ops __read_mostly = { - .type = &nft_hash_type, - .privsize = nft_hash_privsize, - .elemsize = offsetof(struct nft_hash_elem, ext), - .estimate = nft_hash_estimate, - .init = nft_hash_init, - .destroy = nft_hash_destroy, - .insert = nft_hash_insert, - .activate = nft_hash_activate, - .deactivate = nft_hash_deactivate, - .flush = nft_hash_flush, - .remove = nft_hash_remove, - .lookup = nft_hash_lookup, - .walk = nft_hash_walk, - .get = nft_hash_get, - .features = NFT_SET_MAP | NFT_SET_OBJECT, -}; + if (desc->klen != 4) + return false; -static struct nft_set_ops nft_hash_fast_ops __read_mostly = { - .type = &nft_hash_type, - .privsize = nft_hash_privsize, - .elemsize = offsetof(struct nft_hash_elem, ext), - .estimate = nft_hash_estimate, - .init = nft_hash_init, - .destroy = nft_hash_destroy, - .insert = nft_hash_insert, - .activate = nft_hash_activate, - .deactivate = nft_hash_deactivate, - .flush = nft_hash_flush, - .remove = nft_hash_remove, - .lookup = nft_hash_lookup_fast, - .walk = nft_hash_walk, - .get = nft_hash_get, - .features = NFT_SET_MAP | NFT_SET_OBJECT, -}; - -static const struct nft_set_ops * -nft_hash_select_ops(const struct nft_ctx *ctx, const struct nft_set_desc *desc, - u32 flags) -{ - if (desc->size && !(flags & (NFT_SET_EVAL | NFT_SET_TIMEOUT))) { - switch (desc->klen) { - case 4: - return &nft_hash_fast_ops; - default: - return &nft_hash_ops; - } - } + est->size = sizeof(struct nft_hash) + + nft_hash_buckets(desc->size) * sizeof(struct hlist_head) + + desc->size * sizeof(struct nft_hash_elem); + est->lookup = NFT_SET_CLASS_O_1; + est->space = NFT_SET_CLASS_O_N; - return &nft_rhash_ops; + return true; } +static struct nft_set_type nft_rhash_type __read_mostly = { + .owner = THIS_MODULE, + .features = NFT_SET_MAP | NFT_SET_OBJECT | + NFT_SET_TIMEOUT | NFT_SET_EVAL, + .ops = { + .privsize = nft_rhash_privsize, + .elemsize = offsetof(struct nft_rhash_elem, ext), + .estimate = nft_rhash_estimate, + .init = nft_rhash_init, + .destroy = nft_rhash_destroy, + .insert = nft_rhash_insert, + .activate = nft_rhash_activate, + .deactivate = nft_rhash_deactivate, + .flush = nft_rhash_flush, + .remove = nft_rhash_remove, + .lookup = nft_rhash_lookup, + .update = nft_rhash_update, + .walk = nft_rhash_walk, + .get = nft_rhash_get, + }, +}; + static struct nft_set_type nft_hash_type __read_mostly = { - .select_ops = nft_hash_select_ops, .owner = THIS_MODULE, + .features = 
NFT_SET_MAP | NFT_SET_OBJECT, + .ops = { + .privsize = nft_hash_privsize, + .elemsize = offsetof(struct nft_hash_elem, ext), + .estimate = nft_hash_estimate, + .init = nft_hash_init, + .destroy = nft_hash_destroy, + .insert = nft_hash_insert, + .activate = nft_hash_activate, + .deactivate = nft_hash_deactivate, + .flush = nft_hash_flush, + .remove = nft_hash_remove, + .lookup = nft_hash_lookup, + .walk = nft_hash_walk, + .get = nft_hash_get, + }, +}; + +static struct nft_set_type nft_hash_fast_type __read_mostly = { + .owner = THIS_MODULE, + .features = NFT_SET_MAP | NFT_SET_OBJECT, + .ops = { + .privsize = nft_hash_privsize, + .elemsize = offsetof(struct nft_hash_elem, ext), + .estimate = nft_hash_fast_estimate, + .init = nft_hash_init, + .destroy = nft_hash_destroy, + .insert = nft_hash_insert, + .activate = nft_hash_activate, + .deactivate = nft_hash_deactivate, + .flush = nft_hash_flush, + .remove = nft_hash_remove, + .lookup = nft_hash_lookup_fast, + .walk = nft_hash_walk, + .get = nft_hash_get, + }, }; static int __init nft_hash_module_init(void) { - return nft_register_set(&nft_hash_type); + if (nft_register_set(&nft_hash_fast_type) || + nft_register_set(&nft_hash_type) || + nft_register_set(&nft_rhash_type)) + return 1; + return 0; } static void __exit nft_hash_module_exit(void) { + nft_unregister_set(&nft_rhash_type); nft_unregister_set(&nft_hash_type); + nft_unregister_set(&nft_hash_fast_type); } module_init(nft_hash_module_init); diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index e6f08bc5f359..22c57d7612c4 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -393,28 +393,24 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, return true; } -static struct nft_set_type nft_rbtree_type; -static struct nft_set_ops nft_rbtree_ops __read_mostly = { - .type = &nft_rbtree_type, - .privsize = nft_rbtree_privsize, - .elemsize = offsetof(struct nft_rbtree_elem, ext), - .estimate = nft_rbtree_estimate, - .init = nft_rbtree_init, - .destroy = nft_rbtree_destroy, - .insert = nft_rbtree_insert, - .remove = nft_rbtree_remove, - .deactivate = nft_rbtree_deactivate, - .flush = nft_rbtree_flush, - .activate = nft_rbtree_activate, - .lookup = nft_rbtree_lookup, - .walk = nft_rbtree_walk, - .get = nft_rbtree_get, - .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT, -}; - static struct nft_set_type nft_rbtree_type __read_mostly = { - .ops = &nft_rbtree_ops, .owner = THIS_MODULE, + .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT, + .ops = { + .privsize = nft_rbtree_privsize, + .elemsize = offsetof(struct nft_rbtree_elem, ext), + .estimate = nft_rbtree_estimate, + .init = nft_rbtree_init, + .destroy = nft_rbtree_destroy, + .insert = nft_rbtree_insert, + .remove = nft_rbtree_remove, + .deactivate = nft_rbtree_deactivate, + .flush = nft_rbtree_flush, + .activate = nft_rbtree_activate, + .lookup = nft_rbtree_lookup, + .walk = nft_rbtree_walk, + .get = nft_rbtree_get, + }, }; static int __init nft_rbtree_module_init(void) -- cgit v1.2.3 From 2eb0f624b709e78ec8e2f4c3412947703db99301 Mon Sep 17 00:00:00 2001 From: Thierry Du Tre Date: Wed, 4 Apr 2018 15:38:22 +0200 Subject: netfilter: add NAT support for shifted portmap ranges This is a patch proposal to support shifted ranges in portmaps. (i.e. tcp/udp incoming ports 5000-5100 on WAN redirected to LAN 192.168.1.5:2000-2100) Currently DNAT only works for a single port or identical port ranges. (i.e. ports 5000-5100 on the WAN interface redirected to a LAN host while the original destination port is not altered) When different port ranges are configured, either 'random' mode should be used, or else all incoming connections are mapped onto the first port in the redirect range. (in the described example, WAN:5000-5100 will all be mapped to 192.168.1.5:2000) This patch introduces a new mode, indicated by the flag NF_NAT_RANGE_PROTO_OFFSET, which uses a base port value to calculate an offset from the destination port present in the incoming stream. That offset is then applied as an index within the redirect port range (index modulo range width to handle range overflow). In the described example the base port would be 5000. An incoming stream with destination port 5004 would result in an offset value of 4, which means that the NAT'ed stream will be using destination port 2004. Other possibilities include deterministic mapping of larger or multiple ranges to a smaller range: WAN:5000-5999 -> LAN:5000-5099 (maps WAN port 5*xx to port 50xx). This patch does not change any current behavior. It just adds new NAT proto range functionality which must be selected via the specific flag when its use is intended. A patch for iptables (libipt_DNAT.c + libip6t_DNAT.c) will also be proposed which makes this functionality immediately available. Signed-off-by: Thierry Du Tre Signed-off-by: Pablo Neira Ayuso
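
To make the port arithmetic concrete, here is a small self-contained sketch; shifted_map is an invented helper written only for illustration, while in the patch itself the equivalent computation lives in nf_nat_l4proto_unique_tuple() (see the nf_nat_proto_common.c hunk below, where off is derived from the packet's destination port and range->base_proto).

#include <assert.h>
#include <stdio.h>

/* Map an incoming destination port onto a redirect range shifted by the
 * configured base port; the modulo folds overflowing offsets back into
 * the range. Assumes dport >= base, i.e. base is the low end of the
 * matched port range. */
static unsigned short shifted_map(unsigned short dport, unsigned short base,
				  unsigned short min, unsigned short max)
{
	unsigned int range_size = max - min + 1;
	unsigned int off = dport - base;

	return (unsigned short)(min + off % range_size);
}

int main(void)
{
	/* WAN 5000-5100 -> LAN 2000-2100 with base port 5000 */
	assert(shifted_map(5004, 5000, 2000, 2100) == 2004);
	/* WAN 5000-5999 folded onto LAN 5000-5099: 5234 -> 5034 */
	assert(shifted_map(5234, 5000, 5000, 5099) == 5034);
	printf("ok\n");
	return 0;
}

The second assertion shows the overflow case from the message: a 1000-port range folded deterministically onto a 100-port range via the modulo.
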
--- include/net/netfilter/ipv4/nf_nat_masquerade.h | 2 +- include/net/netfilter/ipv6/nf_nat_masquerade.h | 2 +- include/net/netfilter/nf_nat.h | 2 +- include/net/netfilter/nf_nat_l3proto.h | 4 +- include/net/netfilter/nf_nat_l4proto.h | 8 +-- include/net/netfilter/nf_nat_redirect.h | 2 +- include/uapi/linux/netfilter/nf_nat.h | 12 ++++- net/ipv4/netfilter/ipt_MASQUERADE.c | 2 +- net/ipv4/netfilter/nf_nat_h323.c | 4 +- net/ipv4/netfilter/nf_nat_l3proto_ipv4.c | 4 +- net/ipv4/netfilter/nf_nat_masquerade_ipv4.c | 4 +- net/ipv4/netfilter/nf_nat_pptp.c | 2 +- net/ipv4/netfilter/nf_nat_proto_gre.c | 2 +- net/ipv4/netfilter/nf_nat_proto_icmp.c | 2 +- net/ipv4/netfilter/nft_masq_ipv4.c | 2 +- net/ipv6/netfilter/ip6t_MASQUERADE.c | 2 +- net/ipv6/netfilter/nf_nat_l3proto_ipv6.c | 4 +- net/ipv6/netfilter/nf_nat_masquerade_ipv6.c | 4 +- net/ipv6/netfilter/nf_nat_proto_icmpv6.c | 2 +- net/ipv6/netfilter/nft_masq_ipv6.c | 2 +- net/ipv6/netfilter/nft_redir_ipv6.c | 2 +- net/netfilter/nf_nat_core.c | 27 +++++----- net/netfilter/nf_nat_helper.c | 2 +- net/netfilter/nf_nat_proto_common.c | 9 ++-- net/netfilter/nf_nat_proto_dccp.c | 2 +- net/netfilter/nf_nat_proto_sctp.c | 2 +- net/netfilter/nf_nat_proto_tcp.c | 2 +- net/netfilter/nf_nat_proto_udp.c | 4 +- net/netfilter/nf_nat_proto_unknown.c | 2 +- net/netfilter/nf_nat_redirect.c | 6 +-- net/netfilter/nf_nat_sip.c | 2 +- net/netfilter/nft_nat.c | 2 +- net/netfilter/xt_NETMAP.c | 8 +-- net/netfilter/xt_REDIRECT.c | 2 +- net/netfilter/xt_nat.c | 72 +++++++++++++++++++++++--- net/openvswitch/conntrack.c | 4 +- 36 files changed, 145 insertions(+), 71 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/ipv4/nf_nat_masquerade.h b/include/net/netfilter/ipv4/nf_nat_masquerade.h index ebd869473603..cd24be4c4a99 100644 --- a/include/net/netfilter/ipv4/nf_nat_masquerade.h +++ b/include/net/netfilter/ipv4/nf_nat_masquerade.h @@ -6,7 +6,7 @@ unsigned int nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, const struct net_device *out); void nf_nat_masquerade_ipv4_register_notifier(void); diff --git
a/include/net/netfilter/ipv6/nf_nat_masquerade.h b/include/net/netfilter/ipv6/nf_nat_masquerade.h index 1ed4f2631ed6..0c3b5ebf0bb8 100644 --- a/include/net/netfilter/ipv6/nf_nat_masquerade.h +++ b/include/net/netfilter/ipv6/nf_nat_masquerade.h @@ -3,7 +3,7 @@ #define _NF_NAT_MASQUERADE_IPV6_H_ unsigned int -nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range *range, +nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, const struct net_device *out); void nf_nat_masquerade_ipv6_register_notifier(void); void nf_nat_masquerade_ipv6_unregister_notifier(void); diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h index 207a467e7ca6..da3d601cadee 100644 --- a/include/net/netfilter/nf_nat.h +++ b/include/net/netfilter/nf_nat.h @@ -39,7 +39,7 @@ struct nf_conn_nat { /* Set up the info structure to map into this range. */ unsigned int nf_nat_setup_info(struct nf_conn *ct, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype); extern unsigned int nf_nat_alloc_null_binding(struct nf_conn *ct, diff --git a/include/net/netfilter/nf_nat_l3proto.h b/include/net/netfilter/nf_nat_l3proto.h index ce7c2b4e64bb..ac47098a61dc 100644 --- a/include/net/netfilter/nf_nat_l3proto.h +++ b/include/net/netfilter/nf_nat_l3proto.h @@ -7,7 +7,7 @@ struct nf_nat_l3proto { u8 l3proto; bool (*in_range)(const struct nf_conntrack_tuple *t, - const struct nf_nat_range *range); + const struct nf_nat_range2 *range); u32 (*secure_port)(const struct nf_conntrack_tuple *t, __be16); @@ -33,7 +33,7 @@ struct nf_nat_l3proto { struct flowi *fl); int (*nlattr_to_range)(struct nlattr *tb[], - struct nf_nat_range *range); + struct nf_nat_range2 *range); }; int nf_nat_l3proto_register(const struct nf_nat_l3proto *); diff --git a/include/net/netfilter/nf_nat_l4proto.h b/include/net/netfilter/nf_nat_l4proto.h index 67835ff8a2d9..b4d6b29bca62 100644 --- a/include/net/netfilter/nf_nat_l4proto.h +++ b/include/net/netfilter/nf_nat_l4proto.h @@ -34,12 +34,12 @@ struct nf_nat_l4proto { */ void (*unique_tuple)(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct); int (*nlattr_to_range)(struct nlattr *tb[], - struct nf_nat_range *range); + struct nf_nat_range2 *range); }; /* Protocol registration. 
*/ @@ -72,11 +72,11 @@ bool nf_nat_l4proto_in_range(const struct nf_conntrack_tuple *tuple, void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct, u16 *rover); int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[], - struct nf_nat_range *range); + struct nf_nat_range2 *range); #endif /*_NF_NAT_L4PROTO_H*/ diff --git a/include/net/netfilter/nf_nat_redirect.h b/include/net/netfilter/nf_nat_redirect.h index 5ddabb08c472..c129aacc8ae8 100644 --- a/include/net/netfilter/nf_nat_redirect.h +++ b/include/net/netfilter/nf_nat_redirect.h @@ -7,7 +7,7 @@ nf_nat_redirect_ipv4(struct sk_buff *skb, const struct nf_nat_ipv4_multi_range_compat *mr, unsigned int hooknum); unsigned int -nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range *range, +nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, unsigned int hooknum); #endif /* _NF_NAT_REDIRECT_H_ */ diff --git a/include/uapi/linux/netfilter/nf_nat.h b/include/uapi/linux/netfilter/nf_nat.h index a33000da7229..4a95c0db14d4 100644 --- a/include/uapi/linux/netfilter/nf_nat.h +++ b/include/uapi/linux/netfilter/nf_nat.h @@ -10,6 +10,7 @@ #define NF_NAT_RANGE_PROTO_RANDOM (1 << 2) #define NF_NAT_RANGE_PERSISTENT (1 << 3) #define NF_NAT_RANGE_PROTO_RANDOM_FULLY (1 << 4) +#define NF_NAT_RANGE_PROTO_OFFSET (1 << 5) #define NF_NAT_RANGE_PROTO_RANDOM_ALL \ (NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PROTO_RANDOM_FULLY) @@ -17,7 +18,7 @@ #define NF_NAT_RANGE_MASK \ (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED | \ NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PERSISTENT | \ - NF_NAT_RANGE_PROTO_RANDOM_FULLY) + NF_NAT_RANGE_PROTO_RANDOM_FULLY | NF_NAT_RANGE_PROTO_OFFSET) struct nf_nat_ipv4_range { unsigned int flags; @@ -40,4 +41,13 @@ struct nf_nat_range { union nf_conntrack_man_proto max_proto; }; +struct nf_nat_range2 { + unsigned int flags; + union nf_inet_addr min_addr; + union nf_inet_addr max_addr; + union nf_conntrack_man_proto min_proto; + union nf_conntrack_man_proto max_proto; + union nf_conntrack_man_proto base_proto; +}; + #endif /* _NETFILTER_NF_NAT_H */ diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index a03e4e7ef5f9..ce1512b02cb2 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -47,7 +47,7 @@ static int masquerade_tg_check(const struct xt_tgchk_param *par) static unsigned int masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par) { - struct nf_nat_range range; + struct nf_nat_range2 range; const struct nf_nat_ipv4_multi_range_compat *mr; mr = par->targinfo; diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index ac8342dcb55e..4e6b53ab6c33 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c @@ -395,7 +395,7 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct, static void ip_nat_q931_expect(struct nf_conn *new, struct nf_conntrack_expect *this) { - struct nf_nat_range range; + struct nf_nat_range2 range; if (this->tuple.src.u3.ip != 0) { /* Only accept calls from GK */ nf_nat_follow_master(new, this); @@ -497,7 +497,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct, static void ip_nat_callforwarding_expect(struct nf_conn *new, struct nf_conntrack_expect *this) { - struct nf_nat_range range; + struct nf_nat_range2 range; /* This must be a fresh 
one. */ BUG_ON(new->status & IPS_NAT_DONE_MASK); diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index f7ff6a364d7b..4346336cee4c 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -63,7 +63,7 @@ static void nf_nat_ipv4_decode_session(struct sk_buff *skb, #endif /* CONFIG_XFRM */ static bool nf_nat_ipv4_in_range(const struct nf_conntrack_tuple *t, - const struct nf_nat_range *range) + const struct nf_nat_range2 *range) { return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) && ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip); @@ -143,7 +143,7 @@ static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[], - struct nf_nat_range *range) + struct nf_nat_range2 *range) { if (tb[CTA_NAT_V4_MINIP]) { range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]); diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c index 0c366aad89cb..f538c5001547 100644 --- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c @@ -24,13 +24,13 @@ unsigned int nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, const struct net_device *out) { struct nf_conn *ct; struct nf_conn_nat *nat; enum ip_conntrack_info ctinfo; - struct nf_nat_range newrange; + struct nf_nat_range2 newrange; const struct rtable *rt; __be32 newsrc, nh; diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c index 8a69363b4884..5d259a12e25f 100644 --- a/net/ipv4/netfilter/nf_nat_pptp.c +++ b/net/ipv4/netfilter/nf_nat_pptp.c @@ -48,7 +48,7 @@ static void pptp_nat_expected(struct nf_conn *ct, struct nf_conntrack_tuple t = {}; const struct nf_ct_pptp_master *ct_pptp_info; const struct nf_nat_pptp *nat_pptp_info; - struct nf_nat_range range; + struct nf_nat_range2 range; struct nf_conn_nat *nat; nat = nf_ct_nat_ext_add(ct); diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c index edf05002d674..00fda6331ce5 100644 --- a/net/ipv4/netfilter/nf_nat_proto_gre.c +++ b/net/ipv4/netfilter/nf_nat_proto_gre.c @@ -41,7 +41,7 @@ MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE"); static void gre_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c index 7b98baa13ede..6d7cf1d79baf 100644 --- a/net/ipv4/netfilter/nf_nat_proto_icmp.c +++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c @@ -30,7 +30,7 @@ icmp_in_range(const struct nf_conntrack_tuple *tuple, static void icmp_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c index f18677277119..f1193e1e928a 100644 --- a/net/ipv4/netfilter/nft_masq_ipv4.c +++ b/net/ipv4/netfilter/nft_masq_ipv4.c @@ -21,7 +21,7 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr, const struct nft_pktinfo *pkt) { struct nft_masq *priv = nft_expr_priv(expr); - struct nf_nat_range 
range; + struct nf_nat_range2 range; memset(&range, 0, sizeof(range)); range.flags = priv->flags; diff --git a/net/ipv6/netfilter/ip6t_MASQUERADE.c b/net/ipv6/netfilter/ip6t_MASQUERADE.c index 92c0047e7e33..491f808e356a 100644 --- a/net/ipv6/netfilter/ip6t_MASQUERADE.c +++ b/net/ipv6/netfilter/ip6t_MASQUERADE.c @@ -29,7 +29,7 @@ masquerade_tg6(struct sk_buff *skb, const struct xt_action_param *par) static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par) { - const struct nf_nat_range *range = par->targinfo; + const struct nf_nat_range2 *range = par->targinfo; if (range->flags & NF_NAT_RANGE_MAP_IPS) return -EINVAL; diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c index 6b7f075f811f..56d75eb5448f 100644 --- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c @@ -62,7 +62,7 @@ static void nf_nat_ipv6_decode_session(struct sk_buff *skb, #endif static bool nf_nat_ipv6_in_range(const struct nf_conntrack_tuple *t, - const struct nf_nat_range *range) + const struct nf_nat_range2 *range) { return ipv6_addr_cmp(&t->src.u3.in6, &range->min_addr.in6) >= 0 && ipv6_addr_cmp(&t->src.u3.in6, &range->max_addr.in6) <= 0; @@ -151,7 +151,7 @@ static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[], - struct nf_nat_range *range) + struct nf_nat_range2 *range) { if (tb[CTA_NAT_V6_MINIP]) { nla_memcpy(&range->min_addr.ip6, tb[CTA_NAT_V6_MINIP], diff --git a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c index 98f61fcb9108..9dfc2b90c362 100644 --- a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c @@ -26,14 +26,14 @@ static atomic_t v6_worker_count; unsigned int -nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range *range, +nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, const struct net_device *out) { enum ip_conntrack_info ctinfo; struct nf_conn_nat *nat; struct in6_addr src; struct nf_conn *ct; - struct nf_nat_range newrange; + struct nf_nat_range2 newrange; ct = nf_ct_get(skb, &ctinfo); WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || diff --git a/net/ipv6/netfilter/nf_nat_proto_icmpv6.c b/net/ipv6/netfilter/nf_nat_proto_icmpv6.c index 57593b00c5b4..d9bf42ba44fa 100644 --- a/net/ipv6/netfilter/nf_nat_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_nat_proto_icmpv6.c @@ -32,7 +32,7 @@ icmpv6_in_range(const struct nf_conntrack_tuple *tuple, static void icmpv6_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c index 4146536e9c15..dd0122f3cffe 100644 --- a/net/ipv6/netfilter/nft_masq_ipv6.c +++ b/net/ipv6/netfilter/nft_masq_ipv6.c @@ -22,7 +22,7 @@ static void nft_masq_ipv6_eval(const struct nft_expr *expr, const struct nft_pktinfo *pkt) { struct nft_masq *priv = nft_expr_priv(expr); - struct nf_nat_range range; + struct nf_nat_range2 range; memset(&range, 0, sizeof(range)); range.flags = priv->flags; diff --git a/net/ipv6/netfilter/nft_redir_ipv6.c b/net/ipv6/netfilter/nft_redir_ipv6.c index a27e424f690d..74269865acc8 100644 --- a/net/ipv6/netfilter/nft_redir_ipv6.c +++ b/net/ipv6/netfilter/nft_redir_ipv6.c @@ -22,7 +22,7 
@@ static void nft_redir_ipv6_eval(const struct nft_expr *expr, const struct nft_pktinfo *pkt) { struct nft_redir *priv = nft_expr_priv(expr); - struct nf_nat_range range; + struct nf_nat_range2 range; memset(&range, 0, sizeof(range)); if (priv->sreg_proto_min) { diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 617693ff9f4c..37b3c9913b08 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -157,7 +157,7 @@ EXPORT_SYMBOL(nf_nat_used_tuple); static int in_range(const struct nf_nat_l3proto *l3proto, const struct nf_nat_l4proto *l4proto, const struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range) + const struct nf_nat_range2 *range) { /* If we are supposed to map IPs, then we must be in the * range specified, otherwise let this drag us onto a new src IP. @@ -194,7 +194,7 @@ find_appropriate_src(struct net *net, const struct nf_nat_l4proto *l4proto, const struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *result, - const struct nf_nat_range *range) + const struct nf_nat_range2 *range) { unsigned int h = hash_by_src(net, tuple); const struct nf_conn *ct; @@ -224,7 +224,7 @@ find_appropriate_src(struct net *net, static void find_best_ips_proto(const struct nf_conntrack_zone *zone, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, const struct nf_conn *ct, enum nf_nat_manip_type maniptype) { @@ -298,7 +298,7 @@ find_best_ips_proto(const struct nf_conntrack_zone *zone, static void get_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig_tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, struct nf_conn *ct, enum nf_nat_manip_type maniptype) { @@ -349,9 +349,10 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, /* Only bother mapping if it's not already in range and unique */ if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { - if (l4proto->in_range(tuple, maniptype, - &range->min_proto, - &range->max_proto) && + if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) && + l4proto->in_range(tuple, maniptype, + &range->min_proto, + &range->max_proto) && (range->min_proto.all == range->max_proto.all || !nf_nat_used_tuple(tuple, ct))) goto out; @@ -360,7 +361,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, } } - /* Last change: get protocol to try to obtain unique tuple. */ + /* Last chance: get protocol to try to obtain unique tuple. */ l4proto->unique_tuple(l3proto, tuple, range, maniptype, ct); out: rcu_read_unlock(); @@ -381,7 +382,7 @@ EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add); unsigned int nf_nat_setup_info(struct nf_conn *ct, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype) { struct net *net = nf_ct_net(ct); @@ -459,7 +460,7 @@ __nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip) (manip == NF_NAT_MANIP_SRC ? 
ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 : ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3); - struct nf_nat_range range = { + struct nf_nat_range2 range = { .flags = NF_NAT_RANGE_MAP_IPS, .min_addr = ip, .max_addr = ip, @@ -702,7 +703,7 @@ static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = { static int nfnetlink_parse_nat_proto(struct nlattr *attr, const struct nf_conn *ct, - struct nf_nat_range *range) + struct nf_nat_range2 *range) { struct nlattr *tb[CTA_PROTONAT_MAX+1]; const struct nf_nat_l4proto *l4proto; @@ -730,7 +731,7 @@ static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = { static int nfnetlink_parse_nat(const struct nlattr *nat, - const struct nf_conn *ct, struct nf_nat_range *range, + const struct nf_conn *ct, struct nf_nat_range2 *range, const struct nf_nat_l3proto *l3proto) { struct nlattr *tb[CTA_NAT_MAX+1]; @@ -758,7 +759,7 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct, enum nf_nat_manip_type manip, const struct nlattr *attr) { - struct nf_nat_range range; + struct nf_nat_range2 range; const struct nf_nat_l3proto *l3proto; int err; diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c index 607a373379b4..99606baedda4 100644 --- a/net/netfilter/nf_nat_helper.c +++ b/net/netfilter/nf_nat_helper.c @@ -191,7 +191,7 @@ EXPORT_SYMBOL(nf_nat_mangle_udp_packet); void nf_nat_follow_master(struct nf_conn *ct, struct nf_conntrack_expect *exp) { - struct nf_nat_range range; + struct nf_nat_range2 range; /* This must be a fresh one. */ BUG_ON(ct->status & IPS_NAT_DONE_MASK); diff --git a/net/netfilter/nf_nat_proto_common.c b/net/netfilter/nf_nat_proto_common.c index 7d7466dbf663..5d849d835561 100644 --- a/net/netfilter/nf_nat_proto_common.c +++ b/net/netfilter/nf_nat_proto_common.c @@ -36,7 +36,7 @@ EXPORT_SYMBOL_GPL(nf_nat_l4proto_in_range); void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct, u16 *rover) @@ -83,6 +83,8 @@ void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto, : tuple->src.u.all); } else if (range->flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY) { off = prandom_u32(); + } else if (range->flags & NF_NAT_RANGE_PROTO_OFFSET) { + off = (ntohs(*portptr) - ntohs(range->base_proto.all)); } else { off = *rover; } @@ -91,7 +93,8 @@ void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto, *portptr = htons(min + off % range_size); if (++i != range_size && nf_nat_used_tuple(tuple, ct)) continue; - if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) + if (!(range->flags & (NF_NAT_RANGE_PROTO_RANDOM_ALL| + NF_NAT_RANGE_PROTO_OFFSET))) *rover = off; return; } @@ -100,7 +103,7 @@ EXPORT_SYMBOL_GPL(nf_nat_l4proto_unique_tuple); #if IS_ENABLED(CONFIG_NF_CT_NETLINK) int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[], - struct nf_nat_range *range) + struct nf_nat_range2 *range) { if (tb[CTA_PROTONAT_PORT_MIN]) { range->min_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]); diff --git a/net/netfilter/nf_nat_proto_dccp.c b/net/netfilter/nf_nat_proto_dccp.c index 269fcd5dc34c..67ea0d83aa5a 100644 --- a/net/netfilter/nf_nat_proto_dccp.c +++ b/net/netfilter/nf_nat_proto_dccp.c @@ -23,7 +23,7 @@ static u_int16_t dccp_port_rover; static void dccp_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type 
maniptype, const struct nf_conn *ct) { diff --git a/net/netfilter/nf_nat_proto_sctp.c b/net/netfilter/nf_nat_proto_sctp.c index c57ee3240b1d..1c5d9b65fbba 100644 --- a/net/netfilter/nf_nat_proto_sctp.c +++ b/net/netfilter/nf_nat_proto_sctp.c @@ -17,7 +17,7 @@ static u_int16_t nf_sctp_port_rover; static void sctp_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { diff --git a/net/netfilter/nf_nat_proto_tcp.c b/net/netfilter/nf_nat_proto_tcp.c index 4f8820fc5148..f15fcd475f98 100644 --- a/net/netfilter/nf_nat_proto_tcp.c +++ b/net/netfilter/nf_nat_proto_tcp.c @@ -23,7 +23,7 @@ static u16 tcp_port_rover; static void tcp_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { diff --git a/net/netfilter/nf_nat_proto_udp.c b/net/netfilter/nf_nat_proto_udp.c index edd4a77dc09a..5790f70a83b2 100644 --- a/net/netfilter/nf_nat_proto_udp.c +++ b/net/netfilter/nf_nat_proto_udp.c @@ -22,7 +22,7 @@ static u16 udp_port_rover; static void udp_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { @@ -100,7 +100,7 @@ static bool udplite_manip_pkt(struct sk_buff *skb, static void udplite_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { diff --git a/net/netfilter/nf_nat_proto_unknown.c b/net/netfilter/nf_nat_proto_unknown.c index 6e494d584412..c5db3e251232 100644 --- a/net/netfilter/nf_nat_proto_unknown.c +++ b/net/netfilter/nf_nat_proto_unknown.c @@ -27,7 +27,7 @@ static bool unknown_in_range(const struct nf_conntrack_tuple *tuple, static void unknown_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c index 25b06b959118..7c4bb0a773ca 100644 --- a/net/netfilter/nf_nat_redirect.c +++ b/net/netfilter/nf_nat_redirect.c @@ -36,7 +36,7 @@ nf_nat_redirect_ipv4(struct sk_buff *skb, struct nf_conn *ct; enum ip_conntrack_info ctinfo; __be32 newdst; - struct nf_nat_range newrange; + struct nf_nat_range2 newrange; WARN_ON(hooknum != NF_INET_PRE_ROUTING && hooknum != NF_INET_LOCAL_OUT); @@ -82,10 +82,10 @@ EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv4); static const struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; unsigned int -nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range *range, +nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, unsigned int hooknum) { - struct nf_nat_range newrange; + struct nf_nat_range2 newrange; struct in6_addr newdst; enum ip_conntrack_info ctinfo; struct nf_conn *ct; diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c index 791fac4fd745..1f3086074981 100644 --- a/net/netfilter/nf_nat_sip.c +++ b/net/netfilter/nf_nat_sip.c @@ -316,7 +316,7 @@ static void nf_nat_sip_seq_adjust(struct sk_buff *skb, unsigned int protoff, static void 
nf_nat_sip_expected(struct nf_conn *ct, struct nf_conntrack_expect *exp) { - struct nf_nat_range range; + struct nf_nat_range2 range; /* This must be a fresh one. */ BUG_ON(ct->status & IPS_NAT_DONE_MASK); diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 1f36954c2ba9..c15807d10b91 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -43,7 +43,7 @@ static void nft_nat_eval(const struct nft_expr *expr, const struct nft_nat *priv = nft_expr_priv(expr); enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(pkt->skb, &ctinfo); - struct nf_nat_range range; + struct nf_nat_range2 range; memset(&range, 0, sizeof(range)); if (priv->sreg_addr_min) { diff --git a/net/netfilter/xt_NETMAP.c b/net/netfilter/xt_NETMAP.c index 58aa9dd3c5b7..1d437875e15a 100644 --- a/net/netfilter/xt_NETMAP.c +++ b/net/netfilter/xt_NETMAP.c @@ -21,8 +21,8 @@ static unsigned int netmap_tg6(struct sk_buff *skb, const struct xt_action_param *par) { - const struct nf_nat_range *range = par->targinfo; - struct nf_nat_range newrange; + const struct nf_nat_range2 *range = par->targinfo; + struct nf_nat_range2 newrange; struct nf_conn *ct; enum ip_conntrack_info ctinfo; union nf_inet_addr new_addr, netmask; @@ -56,7 +56,7 @@ netmap_tg6(struct sk_buff *skb, const struct xt_action_param *par) static int netmap_tg6_checkentry(const struct xt_tgchk_param *par) { - const struct nf_nat_range *range = par->targinfo; + const struct nf_nat_range2 *range = par->targinfo; if (!(range->flags & NF_NAT_RANGE_MAP_IPS)) return -EINVAL; @@ -75,7 +75,7 @@ netmap_tg4(struct sk_buff *skb, const struct xt_action_param *par) enum ip_conntrack_info ctinfo; __be32 new_ip, netmask; const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; - struct nf_nat_range newrange; + struct nf_nat_range2 newrange; WARN_ON(xt_hooknum(par) != NF_INET_PRE_ROUTING && xt_hooknum(par) != NF_INET_POST_ROUTING && diff --git a/net/netfilter/xt_REDIRECT.c b/net/netfilter/xt_REDIRECT.c index 98a4c6d4f1cb..5ce9461e979c 100644 --- a/net/netfilter/xt_REDIRECT.c +++ b/net/netfilter/xt_REDIRECT.c @@ -36,7 +36,7 @@ redirect_tg6(struct sk_buff *skb, const struct xt_action_param *par) static int redirect_tg6_checkentry(const struct xt_tgchk_param *par) { - const struct nf_nat_range *range = par->targinfo; + const struct nf_nat_range2 *range = par->targinfo; if (range->flags & NF_NAT_RANGE_MAP_IPS) return -EINVAL; diff --git a/net/netfilter/xt_nat.c b/net/netfilter/xt_nat.c index bdb689cdc829..8af9707f8789 100644 --- a/net/netfilter/xt_nat.c +++ b/net/netfilter/xt_nat.c @@ -37,11 +37,12 @@ static void xt_nat_destroy(const struct xt_tgdtor_param *par) nf_ct_netns_put(par->net, par->family); } -static void xt_nat_convert_range(struct nf_nat_range *dst, +static void xt_nat_convert_range(struct nf_nat_range2 *dst, const struct nf_nat_ipv4_range *src) { memset(&dst->min_addr, 0, sizeof(dst->min_addr)); memset(&dst->max_addr, 0, sizeof(dst->max_addr)); + memset(&dst->base_proto, 0, sizeof(dst->base_proto)); dst->flags = src->flags; dst->min_addr.ip = src->min_ip; @@ -54,7 +55,7 @@ static unsigned int xt_snat_target_v0(struct sk_buff *skb, const struct xt_action_param *par) { const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; - struct nf_nat_range range; + struct nf_nat_range2 range; enum ip_conntrack_info ctinfo; struct nf_conn *ct; @@ -71,7 +72,7 @@ static unsigned int xt_dnat_target_v0(struct sk_buff *skb, const struct xt_action_param *par) { const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; - struct 
nf_nat_range range; + struct nf_nat_range2 range; enum ip_conntrack_info ctinfo; struct nf_conn *ct; @@ -86,7 +87,8 @@ xt_dnat_target_v0(struct sk_buff *skb, const struct xt_action_param *par) static unsigned int xt_snat_target_v1(struct sk_buff *skb, const struct xt_action_param *par) { - const struct nf_nat_range *range = par->targinfo; + const struct nf_nat_range *range_v1 = par->targinfo; + struct nf_nat_range2 range; enum ip_conntrack_info ctinfo; struct nf_conn *ct; @@ -95,13 +97,49 @@ xt_snat_target_v1(struct sk_buff *skb, const struct xt_action_param *par) (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY))); - return nf_nat_setup_info(ct, range, NF_NAT_MANIP_SRC); + memcpy(&range, range_v1, sizeof(*range_v1)); + memset(&range.base_proto, 0, sizeof(range.base_proto)); + + return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC); } static unsigned int xt_dnat_target_v1(struct sk_buff *skb, const struct xt_action_param *par) { - const struct nf_nat_range *range = par->targinfo; + const struct nf_nat_range *range_v1 = par->targinfo; + struct nf_nat_range2 range; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + WARN_ON(!(ct != NULL && + (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED))); + + memcpy(&range, range_v1, sizeof(*range_v1)); + memset(&range.base_proto, 0, sizeof(range.base_proto)); + + return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST); +} + +static unsigned int +xt_snat_target_v2(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct nf_nat_range2 *range = par->targinfo; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + WARN_ON(!(ct != NULL && + (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || + ctinfo == IP_CT_RELATED_REPLY))); + + return nf_nat_setup_info(ct, range, NF_NAT_MANIP_SRC); +} + +static unsigned int +xt_dnat_target_v2(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct nf_nat_range2 *range = par->targinfo; enum ip_conntrack_info ctinfo; struct nf_conn *ct; @@ -163,6 +201,28 @@ static struct xt_target xt_nat_target_reg[] __read_mostly = { (1 << NF_INET_LOCAL_OUT), .me = THIS_MODULE, }, + { + .name = "SNAT", + .revision = 2, + .checkentry = xt_nat_checkentry, + .destroy = xt_nat_destroy, + .target = xt_snat_target_v2, + .targetsize = sizeof(struct nf_nat_range2), + .table = "nat", + .hooks = (1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_IN), + .me = THIS_MODULE, + }, + { + .name = "DNAT", + .revision = 2, + .target = xt_dnat_target_v2, + .targetsize = sizeof(struct nf_nat_range2), + .table = "nat", + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_OUT), + .me = THIS_MODULE, + }, }; static int __init xt_nat_init(void) diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index c5904f629091..02fc343feb66 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -72,7 +72,7 @@ struct ovs_conntrack_info { struct md_mark mark; struct md_labels labels; #ifdef CONFIG_NF_NAT_NEEDED - struct nf_nat_range range; /* Only present for SRC NAT and DST NAT. */ + struct nf_nat_range2 range; /* Only present for SRC NAT and DST NAT. 
*/
#endif
};

@@ -710,7 +710,7 @@ static bool skb_nfct_cached(struct net *net,
 */
static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
			      enum ip_conntrack_info ctinfo,
-			      const struct nf_nat_range *range,
+			      const struct nf_nat_range2 *range,
			      enum nf_nat_manip_type maniptype)
{
	int hooknum, nh_off, err = NF_ACCEPT;
-- cgit v1.2.3


From cd9a5a15808403a7895c51b1378168d6c75cf8a6 Mon Sep 17 00:00:00 2001
From: Taehee Yoo
Date: Mon, 9 Apr 2018 00:00:57 +0900
Subject: netfilter: ebtables: remove EBT_MATCH and EBT_NOMATCH

EBT_MATCH and EBT_NOMATCH exist only to invert a return value. The match
functions (ebt_xxx.c) return false when the received frame does not match
and true when it does, but EBT_MATCH_ITERATE interprets the values the
opposite way, so EBT_MATCH and EBT_NOMATCH are used to translate between
the two conventions. The '!' operator does the same job more simply.

Signed-off-by: Taehee Yoo
Signed-off-by: Pablo Neira Ayuso
---
 include/linux/netfilter_bridge/ebtables.h | 4 ----
 net/bridge/netfilter/ebtables.c | 2 +-
 2 files changed, 1 insertion(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h
index 0773b5a032f1..c6935be7c6ca 100644
--- a/include/linux/netfilter_bridge/ebtables.h
+++ b/include/linux/netfilter_bridge/ebtables.h
@@ -17,10 +17,6 @@
 #include
 #include

-/* return values for match() functions */
-#define EBT_MATCH 0
-#define EBT_NOMATCH 1
-
 struct ebt_match {
 	struct list_head list;
 	const char name[EBT_FUNCTION_MAXNAMELEN];
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 355410b13316..7c07221369c0 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -101,7 +101,7 @@ ebt_do_match(struct ebt_entry_match *m, const struct sk_buff *skb,
 {
 	par->match = m->u.match;
 	par->matchinfo = m->data;
-	return m->u.match->match(skb, par) ? EBT_MATCH : EBT_NOMATCH;
+	return !m->u.match->match(skb, par);
 }

 static inline int
-- cgit v1.2.3


From a1d768f1a00db556e2aae9f92bdb38671e601da5 Mon Sep 17 00:00:00 2001
From: Taehee Yoo
Date: Fri, 13 Apr 2018 23:09:58 +0900
Subject: netfilter: ebtables: add ebt_get_target and ebt_get_target_c

ebt_get_target is similar to {ip/ip6/arp}t_get_target, and
ebt_get_target_c is similar to {ip/ip6/arp}t_get_target_c.
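To illustrate, both helpers just factor out the open-coded pointer
arithmetic that reaches the target record target_offset bytes past an
entry. A condensed before/after sketch of the pattern (distilled from the
conversions in this patch):

	/* before: the cast and offset are spelled out at each call site */
	t = (struct ebt_entry_target *)((char *)e + e->target_offset);

	/* after: one helper hides the arithmetic */
	t = ebt_get_target(e);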
Signed-off-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter_bridge/ebtables.h | 6 ++++++ net/bridge/netfilter/ebtables.c | 22 +++++++++++++--------- 2 files changed, 19 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/netfilter_bridge/ebtables.h b/include/uapi/linux/netfilter_bridge/ebtables.h index 0c7dc8315013..3b86c14ea49d 100644 --- a/include/uapi/linux/netfilter_bridge/ebtables.h +++ b/include/uapi/linux/netfilter_bridge/ebtables.h @@ -191,6 +191,12 @@ struct ebt_entry { unsigned char elems[0] __attribute__ ((aligned (__alignof__(struct ebt_replace)))); }; +static __inline__ struct ebt_entry_target * +ebt_get_target(struct ebt_entry *e) +{ + return (void *)e + e->target_offset; +} + /* {g,s}etsockopt numbers */ #define EBT_BASE_CTL 128 diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 7c07221369c0..9be240129448 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -177,6 +177,12 @@ struct ebt_entry *ebt_next_entry(const struct ebt_entry *entry) return (void *)entry + entry->next_offset; } +static inline const struct ebt_entry_target * +ebt_get_target_c(const struct ebt_entry *e) +{ + return ebt_get_target((struct ebt_entry *)e); +} + /* Do some firewalling */ unsigned int ebt_do_table(struct sk_buff *skb, const struct nf_hook_state *state, @@ -230,8 +236,7 @@ unsigned int ebt_do_table(struct sk_buff *skb, */ EBT_WATCHER_ITERATE(point, ebt_do_watcher, skb, &acpar); - t = (struct ebt_entry_target *) - (((char *)point) + point->target_offset); + t = ebt_get_target_c(point); /* standard target */ if (!t->u.target->target) verdict = ((struct ebt_standard_target *)t)->verdict; @@ -637,7 +642,7 @@ ebt_cleanup_entry(struct ebt_entry *e, struct net *net, unsigned int *cnt) return 1; EBT_WATCHER_ITERATE(e, ebt_cleanup_watcher, net, NULL); EBT_MATCH_ITERATE(e, ebt_cleanup_match, net, NULL); - t = (struct ebt_entry_target *)(((char *)e) + e->target_offset); + t = ebt_get_target(e); par.net = net; par.target = t->u.target; @@ -716,7 +721,7 @@ ebt_check_entry(struct ebt_entry *e, struct net *net, ret = EBT_WATCHER_ITERATE(e, ebt_check_watcher, &tgpar, &j); if (ret != 0) goto cleanup_watchers; - t = (struct ebt_entry_target *)(((char *)e) + e->target_offset); + t = ebt_get_target(e); gap = e->next_offset - e->target_offset; target = xt_request_find_target(NFPROTO_BRIDGE, t->u.name, 0); @@ -789,8 +794,7 @@ static int check_chainloops(const struct ebt_entries *chain, struct ebt_cl_stack if (pos == nentries) continue; } - t = (struct ebt_entry_target *) - (((char *)e) + e->target_offset); + t = ebt_get_target_c(e); if (strcmp(t->u.name, EBT_STANDARD_TARGET)) goto letscontinue; if (e->target_offset + sizeof(struct ebt_standard_target) > @@ -1396,7 +1400,7 @@ static inline int ebt_entry_to_user(struct ebt_entry *e, const char *base, return -EFAULT; hlp = ubase + (((char *)e + e->target_offset) - base); - t = (struct ebt_entry_target *)(((char *)e) + e->target_offset); + t = ebt_get_target_c(e); ret = EBT_MATCH_ITERATE(e, ebt_match_to_user, base, ubase); if (ret != 0) @@ -1737,7 +1741,7 @@ static int compat_copy_entry_to_user(struct ebt_entry *e, void __user **dstptr, return ret; target_offset = e->target_offset - (origsize - *size); - t = (struct ebt_entry_target *) ((char *) e + e->target_offset); + t = ebt_get_target(e); ret = compat_target_to_user(t, dstptr, size); if (ret) @@ -1785,7 +1789,7 @@ static int compat_calc_entry(const struct ebt_entry *e, 
EBT_MATCH_ITERATE(e, compat_calc_match, &off);
 EBT_WATCHER_ITERATE(e, compat_calc_watcher, &off);
-	t = (const struct ebt_entry_target *) ((char *) e + e->target_offset);
+	t = ebt_get_target_c(e);
 off += xt_compat_target_offset(t->u.target);
 off += ebt_compat_entry_padsize();
-- cgit v1.2.3


From 8e1102d5a1596dca10f51e3de800809944f8816d Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Mon, 16 Apr 2018 18:04:49 +0200
Subject: netfilter: nf_tables: support timeouts larger than 23 days

Marco De Benedetto says: I would like to use a timeout of 30 days for
elements in a set, but it seems there is some kind of problem above
24d20h31m23s.

Fix this by using 'jiffies64' for timeout handling to get the same
behaviour on 32 and 64bit systems.

nftables passes timeouts as u64 in milliseconds to the kernel, but on
the kernel side we used a mixture of 'long' and jiffies conversions
rather than u64 and jiffies64.

Bugzilla: https://bugzilla.netfilter.org/show_bug.cgi?id=1237
Signed-off-by: Florian Westphal
Signed-off-by: Pablo Neira Ayuso
---
 include/net/netfilter/nf_tables.h | 4 ++--
 net/netfilter/nf_tables_api.c | 50 +++++++++++++++++++++++++++++----------
 2 files changed, 39 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index de77d36e36b3..435c9e3b9181 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -585,7 +585,7 @@ static inline u64 *nft_set_ext_timeout(const struct nft_set_ext *ext)
 	return nft_set_ext(ext, NFT_SET_EXT_TIMEOUT);
 }

-static inline unsigned long *nft_set_ext_expiration(const struct nft_set_ext *ext)
+static inline u64 *nft_set_ext_expiration(const struct nft_set_ext *ext)
 {
 	return nft_set_ext(ext, NFT_SET_EXT_EXPIRATION);
 }
@@ -603,7 +603,7 @@ static inline struct nft_expr *nft_set_ext_expr(const struct nft_set_ext *ext)
 static inline bool nft_set_elem_expired(const struct nft_set_ext *ext)
 {
 	return nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION) &&
-	       time_is_before_eq_jiffies(*nft_set_ext_expiration(ext));
+	       time_is_before_eq_jiffies64(*nft_set_ext_expiration(ext));
 }

 static inline struct nft_set_ext *nft_set_elem_ext(const struct nft_set *set,
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 9ce35acf491d..d57aeea89a79 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2779,6 +2779,27 @@ cont:
 	return 0;
 }

+static int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result)
+{
+	u64 ms = be64_to_cpu(nla_get_be64(nla));
+	u64 max = (u64)(~((u64)0));
+
+	max = div_u64(max, NSEC_PER_MSEC);
+	if (ms >= max)
+		return -ERANGE;
+
+	ms *= NSEC_PER_MSEC;
+	*result = nsecs_to_jiffies64(ms);
+	return 0;
+}
+
+static u64 nf_jiffies64_to_msecs(u64 input)
+{
+	u64 ms = jiffies64_to_nsecs(input);
+
+	return cpu_to_be64(div_u64(ms, NSEC_PER_MSEC));
+}
+
 static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
			      const struct nft_set *set, u16 event, u16 flags)
 {
@@ -2826,7 +2847,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
 	if (set->timeout &&
 	    nla_put_be64(skb, NFTA_SET_TIMEOUT,
-			 cpu_to_be64(jiffies_to_msecs(set->timeout)),
+			 nf_jiffies64_to_msecs(set->timeout),
			 NFTA_SET_PAD))
 		goto nla_put_failure;
 	if (set->gc_int &&
@@ -3122,8 +3143,10 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
 	if (nla[NFTA_SET_TIMEOUT] != NULL) {
 		if (!(flags & NFT_SET_TIMEOUT))
 			return -EINVAL;
-		timeout = msecs_to_jiffies(be64_to_cpu(nla_get_be64(
-
nla[NFTA_SET_TIMEOUT]))); + + err = nf_msecs_to_jiffies64(nla[NFTA_SET_TIMEOUT], &timeout); + if (err) + return err; } gc_int = 0; if (nla[NFTA_SET_GC_INTERVAL] != NULL) { @@ -3387,8 +3410,8 @@ const struct nft_set_ext_type nft_set_ext_types[] = { .align = __alignof__(u64), }, [NFT_SET_EXT_EXPIRATION] = { - .len = sizeof(unsigned long), - .align = __alignof__(unsigned long), + .len = sizeof(u64), + .align = __alignof__(u64), }, [NFT_SET_EXT_USERDATA] = { .len = sizeof(struct nft_userdata), @@ -3481,22 +3504,21 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) && nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT, - cpu_to_be64(jiffies_to_msecs( - *nft_set_ext_timeout(ext))), + nf_jiffies64_to_msecs(*nft_set_ext_timeout(ext)), NFTA_SET_ELEM_PAD)) goto nla_put_failure; if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { - unsigned long expires, now = jiffies; + u64 expires, now = get_jiffies_64(); expires = *nft_set_ext_expiration(ext); - if (time_before(now, expires)) + if (time_before64(now, expires)) expires -= now; else expires = 0; if (nla_put_be64(skb, NFTA_SET_ELEM_EXPIRATION, - cpu_to_be64(jiffies_to_msecs(expires)), + nf_jiffies64_to_msecs(expires), NFTA_SET_ELEM_PAD)) goto nla_put_failure; } @@ -3871,7 +3893,7 @@ void *nft_set_elem_init(const struct nft_set *set, memcpy(nft_set_ext_data(ext), data, set->dlen); if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) *nft_set_ext_expiration(ext) = - jiffies + timeout; + get_jiffies_64() + timeout; if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) *nft_set_ext_timeout(ext) = timeout; @@ -3958,8 +3980,10 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (nla[NFTA_SET_ELEM_TIMEOUT] != NULL) { if (!(set->flags & NFT_SET_TIMEOUT)) return -EINVAL; - timeout = msecs_to_jiffies(be64_to_cpu(nla_get_be64( - nla[NFTA_SET_ELEM_TIMEOUT]))); + err = nf_msecs_to_jiffies64(nla[NFTA_SET_ELEM_TIMEOUT], + &timeout); + if (err) + return err; } else if (set->flags & NFT_SET_TIMEOUT) { timeout = set->timeout; } -- cgit v1.2.3 From bd2bbdb497dba24b9ca7f6257c83e496c64b6e9d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 16 Apr 2018 19:15:53 +0200 Subject: netfilter: merge meta_bridge into nft_meta It overcomplicates things for no reason. nft_meta_bridge only offers retrieval of bridge port interface name. 
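That one operation is essentially the following condensed sketch
(distilled from the code being removed below; only the NULL check is
kept, everything else is omitted):

	/* resolve the bridge port of the input device, then report the
	 * name of the bridge master it belongs to */
	const struct net_bridge_port *p = br_port_get_rcu(nft_in(pkt));

	if (p)
		strncpy((char *)dest, p->br->dev->name, IFNAMSIZ);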
Because of this being its own module, we had to export all nft_meta functions, which we can then make static again (which even reduces the size of nft_meta -- including bridge port retrieval...): before: text data bss dec hex filename 1838 832 0 2670 a6e net/bridge/netfilter/nft_meta_bridge.ko 6147 936 1 7084 1bac net/netfilter/nft_meta.ko after: 5826 936 1 6763 1a6b net/netfilter/nft_meta.ko Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nft_meta.h | 44 ----------- net/bridge/netfilter/Kconfig | 7 -- net/bridge/netfilter/Makefile | 1 - net/bridge/netfilter/nft_meta_bridge.c | 135 --------------------------------- net/netfilter/nft_meta.c | 90 ++++++++++++++-------- 5 files changed, 58 insertions(+), 219 deletions(-) delete mode 100644 include/net/netfilter/nft_meta.h delete mode 100644 net/bridge/netfilter/nft_meta_bridge.c (limited to 'include') diff --git a/include/net/netfilter/nft_meta.h b/include/net/netfilter/nft_meta.h deleted file mode 100644 index 5c69e9b09388..000000000000 --- a/include/net/netfilter/nft_meta.h +++ /dev/null @@ -1,44 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NFT_META_H_ -#define _NFT_META_H_ - -struct nft_meta { - enum nft_meta_keys key:8; - union { - enum nft_registers dreg:8; - enum nft_registers sreg:8; - }; -}; - -extern const struct nla_policy nft_meta_policy[]; - -int nft_meta_get_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]); - -int nft_meta_set_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]); - -int nft_meta_get_dump(struct sk_buff *skb, - const struct nft_expr *expr); - -int nft_meta_set_dump(struct sk_buff *skb, - const struct nft_expr *expr); - -void nft_meta_get_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt); - -void nft_meta_set_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt); - -void nft_meta_set_destroy(const struct nft_ctx *ctx, - const struct nft_expr *expr); - -int nft_meta_set_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data); - -#endif diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig index f212447794bd..9a0159aebe1a 100644 --- a/net/bridge/netfilter/Kconfig +++ b/net/bridge/netfilter/Kconfig @@ -8,13 +8,6 @@ menuconfig NF_TABLES_BRIDGE bool "Ethernet Bridge nf_tables support" if NF_TABLES_BRIDGE - -config NFT_BRIDGE_META - tristate "Netfilter nf_table bridge meta support" - depends on NFT_META - help - Add support for bridge dedicated meta key. - config NFT_BRIDGE_REJECT tristate "Netfilter nf_tables bridge reject support" depends on NFT_REJECT && NFT_REJECT_IPV4 && NFT_REJECT_IPV6 diff --git a/net/bridge/netfilter/Makefile b/net/bridge/netfilter/Makefile index 4bc758dd4a8c..9b868861f21a 100644 --- a/net/bridge/netfilter/Makefile +++ b/net/bridge/netfilter/Makefile @@ -3,7 +3,6 @@ # Makefile for the netfilter modules for Link Layer filtering on a bridge. 
# -obj-$(CONFIG_NFT_BRIDGE_META) += nft_meta_bridge.o obj-$(CONFIG_NFT_BRIDGE_REJECT) += nft_reject_bridge.o # packet logging diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c deleted file mode 100644 index bb63c9aed55d..000000000000 --- a/net/bridge/netfilter/nft_meta_bridge.c +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Copyright (c) 2014 Intel Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../br_private.h" - -static void nft_meta_bridge_get_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) -{ - const struct nft_meta *priv = nft_expr_priv(expr); - const struct net_device *in = nft_in(pkt), *out = nft_out(pkt); - u32 *dest = ®s->data[priv->dreg]; - const struct net_bridge_port *p; - - switch (priv->key) { - case NFT_META_BRI_IIFNAME: - if (in == NULL || (p = br_port_get_rcu(in)) == NULL) - goto err; - break; - case NFT_META_BRI_OIFNAME: - if (out == NULL || (p = br_port_get_rcu(out)) == NULL) - goto err; - break; - default: - goto out; - } - - strncpy((char *)dest, p->br->dev->name, IFNAMSIZ); - return; -out: - return nft_meta_get_eval(expr, regs, pkt); -err: - regs->verdict.code = NFT_BREAK; -} - -static int nft_meta_bridge_get_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) -{ - struct nft_meta *priv = nft_expr_priv(expr); - unsigned int len; - - priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); - switch (priv->key) { - case NFT_META_BRI_IIFNAME: - case NFT_META_BRI_OIFNAME: - len = IFNAMSIZ; - break; - default: - return nft_meta_get_init(ctx, expr, tb); - } - - priv->dreg = nft_parse_register(tb[NFTA_META_DREG]); - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, len); -} - -static struct nft_expr_type nft_meta_bridge_type; -static const struct nft_expr_ops nft_meta_bridge_get_ops = { - .type = &nft_meta_bridge_type, - .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), - .eval = nft_meta_bridge_get_eval, - .init = nft_meta_bridge_get_init, - .dump = nft_meta_get_dump, -}; - -static const struct nft_expr_ops nft_meta_bridge_set_ops = { - .type = &nft_meta_bridge_type, - .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), - .eval = nft_meta_set_eval, - .init = nft_meta_set_init, - .destroy = nft_meta_set_destroy, - .dump = nft_meta_set_dump, - .validate = nft_meta_set_validate, -}; - -static const struct nft_expr_ops * -nft_meta_bridge_select_ops(const struct nft_ctx *ctx, - const struct nlattr * const tb[]) -{ - if (tb[NFTA_META_KEY] == NULL) - return ERR_PTR(-EINVAL); - - if (tb[NFTA_META_DREG] && tb[NFTA_META_SREG]) - return ERR_PTR(-EINVAL); - - if (tb[NFTA_META_DREG]) - return &nft_meta_bridge_get_ops; - - if (tb[NFTA_META_SREG]) - return &nft_meta_bridge_set_ops; - - return ERR_PTR(-EINVAL); -} - -static struct nft_expr_type nft_meta_bridge_type __read_mostly = { - .family = NFPROTO_BRIDGE, - .name = "meta", - .select_ops = nft_meta_bridge_select_ops, - .policy = nft_meta_policy, - .maxattr = NFTA_META_MAX, - .owner = THIS_MODULE, -}; - -static int __init nft_meta_bridge_module_init(void) -{ - return nft_register_expr(&nft_meta_bridge_type); -} - -static void __exit nft_meta_bridge_module_exit(void) -{ - nft_unregister_expr(&nft_meta_bridge_type); -} - 
-module_init(nft_meta_bridge_module_init); -module_exit(nft_meta_bridge_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Tomasz Bursztyka "); -MODULE_ALIAS_NFT_AF_EXPR(AF_BRIDGE, "meta"); diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 8fb91940e2e7..6c0b82628117 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -1,5 +1,7 @@ /* * Copyright (c) 2008-2009 Patrick McHardy + * Copyright (c) 2014 Intel Corporation + * Author: Tomasz Bursztyka * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -24,21 +26,35 @@ #include /* for TCP_TIME_WAIT */ #include #include -#include #include /* NF_BR_PRE_ROUTING */ +struct nft_meta { + enum nft_meta_keys key:8; + union { + enum nft_registers dreg:8; + enum nft_registers sreg:8; + }; +}; + static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state); -void nft_meta_get_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +#ifdef CONFIG_NF_TABLES_BRIDGE +#include "../bridge/br_private.h" +#endif + +static void nft_meta_get_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { const struct nft_meta *priv = nft_expr_priv(expr); const struct sk_buff *skb = pkt->skb; const struct net_device *in = nft_in(pkt), *out = nft_out(pkt); struct sock *sk; u32 *dest = ®s->data[priv->dreg]; +#ifdef CONFIG_NF_TABLES_BRIDGE + const struct net_bridge_port *p; +#endif switch (priv->key) { case NFT_META_LEN: @@ -214,6 +230,18 @@ void nft_meta_get_eval(const struct nft_expr *expr, case NFT_META_SECPATH: nft_reg_store8(dest, !!skb->sp); break; +#endif +#ifdef CONFIG_NF_TABLES_BRIDGE + case NFT_META_BRI_IIFNAME: + if (in == NULL || (p = br_port_get_rcu(in)) == NULL) + goto err; + strncpy((char *)dest, p->br->dev->name, IFNAMSIZ); + return; + case NFT_META_BRI_OIFNAME: + if (out == NULL || (p = br_port_get_rcu(out)) == NULL) + goto err; + strncpy((char *)dest, p->br->dev->name, IFNAMSIZ); + return; #endif default: WARN_ON(1); @@ -224,11 +252,10 @@ void nft_meta_get_eval(const struct nft_expr *expr, err: regs->verdict.code = NFT_BREAK; } -EXPORT_SYMBOL_GPL(nft_meta_get_eval); -void nft_meta_set_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +static void nft_meta_set_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { const struct nft_meta *meta = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; @@ -258,18 +285,16 @@ void nft_meta_set_eval(const struct nft_expr *expr, WARN_ON(1); } } -EXPORT_SYMBOL_GPL(nft_meta_set_eval); -const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = { +static const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = { [NFTA_META_DREG] = { .type = NLA_U32 }, [NFTA_META_KEY] = { .type = NLA_U32 }, [NFTA_META_SREG] = { .type = NLA_U32 }, }; -EXPORT_SYMBOL_GPL(nft_meta_policy); -int nft_meta_get_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) +static int nft_meta_get_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) { struct nft_meta *priv = nft_expr_priv(expr); unsigned int len; @@ -317,6 +342,14 @@ int nft_meta_get_init(const struct nft_ctx *ctx, case NFT_META_SECPATH: len = sizeof(u8); break; +#endif +#ifdef CONFIG_NF_TABLES_BRIDGE + case NFT_META_BRI_IIFNAME: + case NFT_META_BRI_OIFNAME: + if (ctx->family != NFPROTO_BRIDGE) + return -EOPNOTSUPP; 
+ len = IFNAMSIZ; + break; #endif default: return -EOPNOTSUPP; @@ -326,7 +359,6 @@ int nft_meta_get_init(const struct nft_ctx *ctx, return nft_validate_register_store(ctx, priv->dreg, NULL, NFT_DATA_VALUE, len); } -EXPORT_SYMBOL_GPL(nft_meta_get_init); static int nft_meta_get_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, @@ -360,9 +392,9 @@ static int nft_meta_get_validate(const struct nft_ctx *ctx, #endif } -int nft_meta_set_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) +static int nft_meta_set_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) { struct nft_meta *priv = nft_expr_priv(expr); unsigned int hooks; @@ -388,11 +420,10 @@ int nft_meta_set_validate(const struct nft_ctx *ctx, return nft_chain_validate_hooks(ctx->chain, hooks); } -EXPORT_SYMBOL_GPL(nft_meta_set_validate); -int nft_meta_set_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) +static int nft_meta_set_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) { struct nft_meta *priv = nft_expr_priv(expr); unsigned int len; @@ -424,10 +455,9 @@ int nft_meta_set_init(const struct nft_ctx *ctx, return 0; } -EXPORT_SYMBOL_GPL(nft_meta_set_init); -int nft_meta_get_dump(struct sk_buff *skb, - const struct nft_expr *expr) +static int nft_meta_get_dump(struct sk_buff *skb, + const struct nft_expr *expr) { const struct nft_meta *priv = nft_expr_priv(expr); @@ -440,10 +470,8 @@ int nft_meta_get_dump(struct sk_buff *skb, nla_put_failure: return -1; } -EXPORT_SYMBOL_GPL(nft_meta_get_dump); -int nft_meta_set_dump(struct sk_buff *skb, - const struct nft_expr *expr) +static int nft_meta_set_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_meta *priv = nft_expr_priv(expr); @@ -457,17 +485,15 @@ int nft_meta_set_dump(struct sk_buff *skb, nla_put_failure: return -1; } -EXPORT_SYMBOL_GPL(nft_meta_set_dump); -void nft_meta_set_destroy(const struct nft_ctx *ctx, - const struct nft_expr *expr) +static void nft_meta_set_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) { const struct nft_meta *priv = nft_expr_priv(expr); if (priv->key == NFT_META_NFTRACE) static_branch_dec(&nft_trace_enabled); } -EXPORT_SYMBOL_GPL(nft_meta_set_destroy); static struct nft_expr_type nft_meta_type; static const struct nft_expr_ops nft_meta_get_ops = { -- cgit v1.2.3 From f0316f93897c4c4e67278b175bfbfd3a95ba650a Mon Sep 17 00:00:00 2001 From: Russell King Date: Sat, 5 Dec 2015 18:41:28 +0000 Subject: drm/i2c: tda9950: add CEC driver Add a CEC driver for the TDA9950, which is a stand-alone I2C CEC device, but is also integrated into HDMI transceivers such as the TDA9989 and TDA19989. The TDA9950 contains a command processor which handles retransmissions and the low level bus protocol. The driver just has to read and write the messages, and handle error conditions. 
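As a sketch of that mailbox exchange (condensed from the driver added
below), a transmit request is one multi-byte write of {length, service
id, CEC payload} starting at REG_CDR0; the confirmation and any incoming
messages are later read back from the same registers when the interrupt
asserts:

	u8 buf[CEC_MAX_MSG_SIZE + 2];

	buf[0] = 2 + msg->len;	/* frame length, counting these two header bytes */
	buf[1] = CDR1_REQ;	/* service id: transmit request */
	memcpy(buf + 2, msg->msg, msg->len);
	tda9950_write_range(priv->client, REG_CDR0, buf, 2 + msg->len);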
Reviewed-by: Hans Verkuil Signed-off-by: Russell King --- drivers/gpu/drm/i2c/Kconfig | 5 + drivers/gpu/drm/i2c/Makefile | 1 + drivers/gpu/drm/i2c/tda9950.c | 509 ++++++++++++++++++++++++++++++++++ include/linux/platform_data/tda9950.h | 16 ++ 4 files changed, 531 insertions(+) create mode 100644 drivers/gpu/drm/i2c/tda9950.c create mode 100644 include/linux/platform_data/tda9950.h (limited to 'include') diff --git a/drivers/gpu/drm/i2c/Kconfig b/drivers/gpu/drm/i2c/Kconfig index a6c92beb410a..3a232f5ff0a1 100644 --- a/drivers/gpu/drm/i2c/Kconfig +++ b/drivers/gpu/drm/i2c/Kconfig @@ -26,4 +26,9 @@ config DRM_I2C_NXP_TDA998X help Support for NXP Semiconductors TDA998X HDMI encoders. +config DRM_I2C_NXP_TDA9950 + tristate "NXP Semiconductors TDA9950/TDA998X HDMI CEC" + select CEC_NOTIFIER + select CEC_CORE + endmenu diff --git a/drivers/gpu/drm/i2c/Makefile b/drivers/gpu/drm/i2c/Makefile index b20100c18ffb..a962f6f08568 100644 --- a/drivers/gpu/drm/i2c/Makefile +++ b/drivers/gpu/drm/i2c/Makefile @@ -7,3 +7,4 @@ obj-$(CONFIG_DRM_I2C_SIL164) += sil164.o tda998x-y := tda998x_drv.o obj-$(CONFIG_DRM_I2C_NXP_TDA998X) += tda998x.o +obj-$(CONFIG_DRM_I2C_NXP_TDA9950) += tda9950.o diff --git a/drivers/gpu/drm/i2c/tda9950.c b/drivers/gpu/drm/i2c/tda9950.c new file mode 100644 index 000000000000..3f7396caad48 --- /dev/null +++ b/drivers/gpu/drm/i2c/tda9950.c @@ -0,0 +1,509 @@ +/* + * TDA9950 Consumer Electronics Control driver + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * The NXP TDA9950 implements the HDMI Consumer Electronics Control + * interface. The host interface is similar to a mailbox: the data + * registers starting at REG_CDR0 are written to send a command to the + * internal CPU, and replies are read from these registers. + * + * As the data registers represent a mailbox, they must be accessed + * as a single I2C transaction. See the TDA9950 data sheet for details. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +enum { + REG_CSR = 0x00, + CSR_BUSY = BIT(7), + CSR_INT = BIT(6), + CSR_ERR = BIT(5), + + REG_CER = 0x01, + + REG_CVR = 0x02, + + REG_CCR = 0x03, + CCR_RESET = BIT(7), + CCR_ON = BIT(6), + + REG_ACKH = 0x04, + REG_ACKL = 0x05, + + REG_CCONR = 0x06, + CCONR_ENABLE_ERROR = BIT(4), + CCONR_RETRY_MASK = 7, + + REG_CDR0 = 0x07, + + CDR1_REQ = 0x00, + CDR1_CNF = 0x01, + CDR1_IND = 0x81, + CDR1_ERR = 0x82, + CDR1_IER = 0x83, + + CDR2_CNF_SUCCESS = 0x00, + CDR2_CNF_OFF_STATE = 0x80, + CDR2_CNF_BAD_REQ = 0x81, + CDR2_CNF_CEC_ACCESS = 0x82, + CDR2_CNF_ARB_ERROR = 0x83, + CDR2_CNF_BAD_TIMING = 0x84, + CDR2_CNF_NACK_ADDR = 0x85, + CDR2_CNF_NACK_DATA = 0x86, +}; + +struct tda9950_priv { + struct i2c_client *client; + struct device *hdmi; + struct cec_adapter *adap; + struct tda9950_glue *glue; + u16 addresses; + struct cec_msg rx_msg; + struct cec_notifier *notify; + bool open; +}; + +static int tda9950_write_range(struct i2c_client *client, u8 addr, u8 *p, int cnt) +{ + struct i2c_msg msg; + u8 buf[cnt + 1]; + int ret; + + buf[0] = addr; + memcpy(buf + 1, p, cnt); + + msg.addr = client->addr; + msg.flags = 0; + msg.len = cnt + 1; + msg.buf = buf; + + dev_dbg(&client->dev, "wr 0x%02x: %*ph\n", addr, cnt, p); + + ret = i2c_transfer(client->adapter, &msg, 1); + if (ret < 0) + dev_err(&client->dev, "Error %d writing to cec:0x%x\n", ret, addr); + return ret < 0 ? 
ret : 0; +} + +static void tda9950_write(struct i2c_client *client, u8 addr, u8 val) +{ + tda9950_write_range(client, addr, &val, 1); +} + +static int tda9950_read_range(struct i2c_client *client, u8 addr, u8 *p, int cnt) +{ + struct i2c_msg msg[2]; + int ret; + + msg[0].addr = client->addr; + msg[0].flags = 0; + msg[0].len = 1; + msg[0].buf = &addr; + msg[1].addr = client->addr; + msg[1].flags = I2C_M_RD; + msg[1].len = cnt; + msg[1].buf = p; + + ret = i2c_transfer(client->adapter, msg, 2); + if (ret < 0) + dev_err(&client->dev, "Error %d reading from cec:0x%x\n", ret, addr); + + dev_dbg(&client->dev, "rd 0x%02x: %*ph\n", addr, cnt, p); + + return ret; +} + +static u8 tda9950_read(struct i2c_client *client, u8 addr) +{ + int ret; + u8 val; + + ret = tda9950_read_range(client, addr, &val, 1); + if (ret < 0) + val = 0; + + return val; +} + +static irqreturn_t tda9950_irq(int irq, void *data) +{ + struct tda9950_priv *priv = data; + unsigned int tx_status; + u8 csr, cconr, buf[19]; + u8 arb_lost_cnt, nack_cnt, err_cnt; + + if (!priv->open) + return IRQ_NONE; + + csr = tda9950_read(priv->client, REG_CSR); + if (!(csr & CSR_INT)) + return IRQ_NONE; + + cconr = tda9950_read(priv->client, REG_CCONR) & CCONR_RETRY_MASK; + + tda9950_read_range(priv->client, REG_CDR0, buf, sizeof(buf)); + + /* + * This should never happen: the data sheet says that there will + * always be a valid message if the interrupt line is asserted. + */ + if (buf[0] == 0) { + dev_warn(&priv->client->dev, "interrupt pending, but no message?\n"); + return IRQ_NONE; + } + + switch (buf[1]) { + case CDR1_CNF: /* transmit result */ + arb_lost_cnt = nack_cnt = err_cnt = 0; + switch (buf[2]) { + case CDR2_CNF_SUCCESS: + tx_status = CEC_TX_STATUS_OK; + break; + + case CDR2_CNF_ARB_ERROR: + tx_status = CEC_TX_STATUS_ARB_LOST; + arb_lost_cnt = cconr; + break; + + case CDR2_CNF_NACK_ADDR: + tx_status = CEC_TX_STATUS_NACK; + nack_cnt = cconr; + break; + + default: /* some other error, refer to TDA9950 docs */ + dev_err(&priv->client->dev, "CNF reply error 0x%02x\n", + buf[2]); + tx_status = CEC_TX_STATUS_ERROR; + err_cnt = cconr; + break; + } + /* TDA9950 executes all retries for us */ + tx_status |= CEC_TX_STATUS_MAX_RETRIES; + cec_transmit_done(priv->adap, tx_status, arb_lost_cnt, + nack_cnt, 0, err_cnt); + break; + + case CDR1_IND: + priv->rx_msg.len = buf[0] - 2; + if (priv->rx_msg.len > CEC_MAX_MSG_SIZE) + priv->rx_msg.len = CEC_MAX_MSG_SIZE; + + memcpy(priv->rx_msg.msg, buf + 2, priv->rx_msg.len); + cec_received_msg(priv->adap, &priv->rx_msg); + break; + + default: /* unknown */ + dev_err(&priv->client->dev, "unknown service id 0x%02x\n", + buf[1]); + break; + } + + return IRQ_HANDLED; +} + +static int tda9950_cec_transmit(struct cec_adapter *adap, u8 attempts, + u32 signal_free_time, struct cec_msg *msg) +{ + struct tda9950_priv *priv = adap->priv; + u8 buf[CEC_MAX_MSG_SIZE + 2]; + + buf[0] = 2 + msg->len; + buf[1] = CDR1_REQ; + memcpy(buf + 2, msg->msg, msg->len); + + if (attempts > 5) + attempts = 5; + + tda9950_write(priv->client, REG_CCONR, attempts); + + return tda9950_write_range(priv->client, REG_CDR0, buf, 2 + msg->len); +} + +static int tda9950_cec_adap_log_addr(struct cec_adapter *adap, u8 addr) +{ + struct tda9950_priv *priv = adap->priv; + u16 addresses; + u8 buf[2]; + + if (addr == CEC_LOG_ADDR_INVALID) + addresses = priv->addresses = 0; + else + addresses = priv->addresses |= BIT(addr); + + /* TDA9950 doesn't want address 15 set */ + addresses &= 0x7fff; + buf[0] = addresses >> 8; + buf[1] = addresses; + + return 
tda9950_write_range(priv->client, REG_ACKH, buf, 2); +} + +/* + * When operating as part of the TDA998x, we need additional handling + * to initialise and shut down the TDA9950 part of the device. These + * two hooks are provided to allow the TDA998x code to perform those + * activities. + */ +static int tda9950_glue_open(struct tda9950_priv *priv) +{ + int ret = 0; + + if (priv->glue && priv->glue->open) + ret = priv->glue->open(priv->glue->data); + + priv->open = true; + + return ret; +} + +static void tda9950_glue_release(struct tda9950_priv *priv) +{ + priv->open = false; + + if (priv->glue && priv->glue->release) + priv->glue->release(priv->glue->data); +} + +static int tda9950_open(struct tda9950_priv *priv) +{ + struct i2c_client *client = priv->client; + int ret; + + ret = tda9950_glue_open(priv); + if (ret) + return ret; + + /* Reset the TDA9950, and wait 250ms for it to recover */ + tda9950_write(client, REG_CCR, CCR_RESET); + msleep(250); + + tda9950_cec_adap_log_addr(priv->adap, CEC_LOG_ADDR_INVALID); + + /* Start the command processor */ + tda9950_write(client, REG_CCR, CCR_ON); + + return 0; +} + +static void tda9950_release(struct tda9950_priv *priv) +{ + struct i2c_client *client = priv->client; + int timeout = 50; + u8 csr; + + /* Stop the command processor */ + tda9950_write(client, REG_CCR, 0); + + /* Wait up to .5s for it to signal non-busy */ + do { + csr = tda9950_read(client, REG_CSR); + if (!(csr & CSR_BUSY) || --timeout) + break; + msleep(10); + } while (1); + + /* Warn the user that their IRQ may die if it's shared. */ + if (csr & CSR_BUSY) + dev_warn(&client->dev, "command processor failed to stop, irq%d may die (csr=0x%02x)\n", + client->irq, csr); + + tda9950_glue_release(priv); +} + +static int tda9950_cec_adap_enable(struct cec_adapter *adap, bool enable) +{ + struct tda9950_priv *priv = adap->priv; + + if (!enable) { + tda9950_release(priv); + return 0; + } else { + return tda9950_open(priv); + } +} + +static const struct cec_adap_ops tda9950_cec_ops = { + .adap_enable = tda9950_cec_adap_enable, + .adap_log_addr = tda9950_cec_adap_log_addr, + .adap_transmit = tda9950_cec_transmit, +}; + +/* + * When operating as part of the TDA998x, we need to claim additional + * resources. These two hooks permit the management of those resources. + */ +static void tda9950_devm_glue_exit(void *data) +{ + struct tda9950_glue *glue = data; + + if (glue && glue->exit) + glue->exit(glue->data); +} + +static int tda9950_devm_glue_init(struct device *dev, struct tda9950_glue *glue) +{ + int ret; + + if (glue && glue->init) { + ret = glue->init(glue->data); + if (ret) + return ret; + } + + ret = devm_add_action(dev, tda9950_devm_glue_exit, glue); + if (ret) + tda9950_devm_glue_exit(glue); + + return ret; +} + +static void tda9950_cec_del(void *data) +{ + struct tda9950_priv *priv = data; + + cec_delete_adapter(priv->adap); +} + +static int tda9950_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + struct tda9950_glue *glue = client->dev.platform_data; + struct device *dev = &client->dev; + struct tda9950_priv *priv; + unsigned long irqflags; + int ret; + u8 cvr; + + /* + * We must have I2C functionality: our multi-byte accesses + * must be performed as a single contiguous transaction. + */ + if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) { + dev_err(&client->dev, + "adapter does not support I2C functionality\n"); + return -ENXIO; + } + + /* We must have an interrupt to be functional. 
*/ + if (client->irq <= 0) { + dev_err(&client->dev, "driver requires an interrupt\n"); + return -ENXIO; + } + + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->client = client; + priv->glue = glue; + + i2c_set_clientdata(client, priv); + + /* + * If we're part of a TDA998x, we want the class devices to be + * associated with the HDMI Tx so we have a tight relationship + * between the HDMI interface and the CEC interface. + */ + priv->hdmi = dev; + if (glue && glue->parent) + priv->hdmi = glue->parent; + + priv->adap = cec_allocate_adapter(&tda9950_cec_ops, priv, "tda9950", + CEC_CAP_DEFAULTS, + CEC_MAX_LOG_ADDRS); + if (IS_ERR(priv->adap)) + return PTR_ERR(priv->adap); + + ret = devm_add_action(dev, tda9950_cec_del, priv); + if (ret) { + cec_delete_adapter(priv->adap); + return ret; + } + + ret = tda9950_devm_glue_init(dev, glue); + if (ret) + return ret; + + ret = tda9950_glue_open(priv); + if (ret) + return ret; + + cvr = tda9950_read(client, REG_CVR); + + dev_info(&client->dev, + "TDA9950 CEC interface, hardware version %u.%u\n", + cvr >> 4, cvr & 15); + + tda9950_glue_release(priv); + + irqflags = IRQF_TRIGGER_FALLING; + if (glue) + irqflags = glue->irq_flags; + + ret = devm_request_threaded_irq(dev, client->irq, NULL, tda9950_irq, + irqflags | IRQF_SHARED | IRQF_ONESHOT, + dev_name(&client->dev), priv); + if (ret < 0) + return ret; + + priv->notify = cec_notifier_get(priv->hdmi); + if (!priv->notify) + return -ENOMEM; + + ret = cec_register_adapter(priv->adap, priv->hdmi); + if (ret < 0) { + cec_notifier_put(priv->notify); + return ret; + } + + /* + * CEC documentation says we must not call cec_delete_adapter + * after a successful call to cec_register_adapter(). + */ + devm_remove_action(dev, tda9950_cec_del, priv); + + cec_register_cec_notifier(priv->adap, priv->notify); + + return 0; +} + +static int tda9950_remove(struct i2c_client *client) +{ + struct tda9950_priv *priv = i2c_get_clientdata(client); + + cec_unregister_adapter(priv->adap); + cec_notifier_put(priv->notify); + + return 0; +} + +static struct i2c_device_id tda9950_ids[] = { + { "tda9950", 0 }, + { }, +}; +MODULE_DEVICE_TABLE(i2c, tda9950_ids); + +static struct i2c_driver tda9950_driver = { + .probe = tda9950_probe, + .remove = tda9950_remove, + .driver = { + .name = "tda9950", + }, + .id_table = tda9950_ids, +}; + +module_i2c_driver(tda9950_driver); + +MODULE_AUTHOR("Russell King "); +MODULE_DESCRIPTION("TDA9950/TDA998x Consumer Electronics Control Driver"); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/platform_data/tda9950.h b/include/linux/platform_data/tda9950.h new file mode 100644 index 000000000000..c65efd461102 --- /dev/null +++ b/include/linux/platform_data/tda9950.h @@ -0,0 +1,16 @@ +#ifndef LINUX_PLATFORM_DATA_TDA9950_H +#define LINUX_PLATFORM_DATA_TDA9950_H + +struct device; + +struct tda9950_glue { + struct device *parent; + unsigned long irq_flags; + void *data; + int (*init)(void *); + void (*exit)(void *); + int (*open)(void *); + void (*release)(void *); +}; + +#endif -- cgit v1.2.3 From 4773e77cdc9b3af93ee1ae7bcf2acf94fde17166 Mon Sep 17 00:00:00 2001 From: Prashanth Prakash Date: Wed, 4 Apr 2018 12:14:50 -0600 Subject: ACPI / CPPC: Add support for CPPC v3 CPPC V3 introduces two new entries to make it easier to convert between abstract processor performance and frequency. The two new entries are lowest frequency and nominal frequency. These are the frequencies corresponding to lowest and nominal abstract performance. 
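With both anchor points available, a cpufreq driver can convert between
the two scales by linear interpolation. A hypothetical sketch (the
function name and the zero-span guard are illustrative only, not part of
this patch; units are whatever the firmware reported):

	static u64 cppc_perf_to_freq(struct cppc_perf_caps *caps, u64 perf)
	{
		u64 span_f = caps->nominal_freq - caps->lowest_freq;
		u64 span_p = caps->nominal_perf - caps->lowest_perf;

		if (!span_p)	/* degenerate table: no usable slope */
			return caps->nominal_freq;

		return caps->lowest_freq +
		       div64_u64(span_f * (perf - caps->lowest_perf), span_p);
	}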
Add support to read the new entries and populate them as part of the CPPC performance capabilities which can be used by cpufreq drivers Signed-off-by: Prashanth Prakash Signed-off-by: Rafael J. Wysocki --- drivers/acpi/cppc_acpi.c | 81 ++++++++++++++++++++++++++++++++++++++---------- include/acpi/cppc_acpi.h | 14 ++++++--- 2 files changed, 75 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index 735c74a4cbdb..8fa3d3a6e5b6 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -156,6 +156,9 @@ show_cppc_data(cppc_get_perf_caps, cppc_perf_caps, highest_perf); show_cppc_data(cppc_get_perf_caps, cppc_perf_caps, lowest_perf); show_cppc_data(cppc_get_perf_caps, cppc_perf_caps, nominal_perf); show_cppc_data(cppc_get_perf_caps, cppc_perf_caps, lowest_nonlinear_perf); +show_cppc_data(cppc_get_perf_caps, cppc_perf_caps, lowest_freq); +show_cppc_data(cppc_get_perf_caps, cppc_perf_caps, nominal_freq); + show_cppc_data(cppc_get_perf_ctrs, cppc_perf_fb_ctrs, reference_perf); show_cppc_data(cppc_get_perf_ctrs, cppc_perf_fb_ctrs, wraparound_time); @@ -183,6 +186,8 @@ static struct attribute *cppc_attrs[] = { &lowest_perf.attr, &lowest_nonlinear_perf.attr, &nominal_perf.attr, + &nominal_freq.attr, + &lowest_freq.attr, NULL }; @@ -613,7 +618,6 @@ bool __weak cpc_ffh_supported(void) return false; } - /** * pcc_data_alloc() - Allocate the pcc_data memory for pcc subspace * @@ -641,6 +645,34 @@ int pcc_data_alloc(int pcc_ss_id) return 0; } + +/* Check if CPPC revision + num_ent combination is supported */ +static bool is_cppc_supported(int revision, int num_ent) +{ + int expected_num_ent; + + switch (revision) { + case CPPC_V2_REV: + expected_num_ent = CPPC_V2_NUM_ENT; + break; + case CPPC_V3_REV: + expected_num_ent = CPPC_V3_NUM_ENT; + break; + default: + pr_debug("Firmware exports unsupported CPPC revision: %d\n", + revision); + return false; + } + + if (expected_num_ent != num_ent) { + pr_debug("Firmware exports %d entries. Expected: %d for CPPC rev:%d\n", + num_ent, expected_num_ent, revision); + return false; + } + + return true; +} + /* * An example CPC table looks like the following. * @@ -731,14 +763,6 @@ int acpi_cppc_processor_probe(struct acpi_processor *pr) cpc_obj->type); goto out_free; } - - /* Only support CPPCv2. Bail otherwise. */ - if (num_ent != CPPC_NUM_ENT) { - pr_debug("Firmware exports %d entries. Expected: %d\n", - num_ent, CPPC_NUM_ENT); - goto out_free; - } - cpc_ptr->num_entries = num_ent; /* Second entry should be revision. */ @@ -750,12 +774,10 @@ int acpi_cppc_processor_probe(struct acpi_processor *pr) cpc_obj->type); goto out_free; } + cpc_ptr->version = cpc_rev; - if (cpc_rev != CPPC_REV) { - pr_debug("Firmware exports revision:%d. Expected:%d\n", - cpc_rev, CPPC_REV); + if (!is_cppc_supported(cpc_rev, num_ent)) goto out_free; - } /* Iterate through remaining entries in _CPC */ for (i = 2; i < num_ent; i++) { @@ -808,6 +830,18 @@ int acpi_cppc_processor_probe(struct acpi_processor *pr) } } per_cpu(cpu_pcc_subspace_idx, pr->id) = pcc_subspace_id; + + /* + * Initialize the remaining cpc_regs as unsupported. 
+	 * Example: In case FW exposes CPPC v2, the below loop will initialize
+	 * LOWEST_FREQ and NOMINAL_FREQ regs as unsupported
+	 */
+	for (i = num_ent - 2; i < MAX_CPC_REG_ENT; i++) {
+		cpc_ptr->cpc_regs[i].type = ACPI_TYPE_INTEGER;
+		cpc_ptr->cpc_regs[i].cpc_entry.int_value = 0;
+	}
+
+
 	/* Store CPU Logical ID */
 	cpc_ptr->cpu_id = pr->id;
@@ -1037,8 +1071,9 @@ int cppc_get_perf_caps(int cpunum, struct cppc_perf_caps *perf_caps)
 {
 	struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpunum);
 	struct cpc_register_resource *highest_reg, *lowest_reg,
-		*lowest_non_linear_reg, *nominal_reg;
-	u64 high, low, nom, min_nonlinear;
+		*lowest_non_linear_reg, *nominal_reg,
+		*low_freq_reg = NULL, *nom_freq_reg = NULL;
+	u64 high, low, nom, min_nonlinear, low_f = 0, nom_f = 0;
 	int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpunum);
 	struct cppc_pcc_data *pcc_ss_data;
 	int ret = 0, regs_in_pcc = 0;
@@ -1053,10 +1088,13 @@ int cppc_get_perf_caps(int cpunum, struct cppc_perf_caps *perf_caps)
 	lowest_reg = &cpc_desc->cpc_regs[LOWEST_PERF];
 	lowest_non_linear_reg = &cpc_desc->cpc_regs[LOW_NON_LINEAR_PERF];
 	nominal_reg = &cpc_desc->cpc_regs[NOMINAL_PERF];
+	low_freq_reg = &cpc_desc->cpc_regs[LOWEST_FREQ];
+	nom_freq_reg = &cpc_desc->cpc_regs[NOMINAL_FREQ];

 	/* Are any of the regs PCC ?*/
 	if (CPC_IN_PCC(highest_reg) || CPC_IN_PCC(lowest_reg) ||
-	    CPC_IN_PCC(lowest_non_linear_reg) || CPC_IN_PCC(nominal_reg)) {
+	    CPC_IN_PCC(lowest_non_linear_reg) || CPC_IN_PCC(nominal_reg) ||
+	    CPC_IN_PCC(low_freq_reg) || CPC_IN_PCC(nom_freq_reg)) {
 		regs_in_pcc = 1;
 		down_write(&pcc_ss_data->pcc_lock);
 		/* Ring doorbell once to update PCC subspace */
@@ -1081,6 +1119,17 @@
 	if (!high || !low || !nom || !min_nonlinear)
 		ret = -EFAULT;

+	/* Read optional lowest and nominal frequencies if present */
+	if (CPC_SUPPORTED(low_freq_reg))
+		cpc_read(cpunum, low_freq_reg, &low_f);
+
+	if (CPC_SUPPORTED(nom_freq_reg))
+		cpc_read(cpunum, nom_freq_reg, &nom_f);
+
+	perf_caps->lowest_freq = low_f;
+	perf_caps->nominal_freq = nom_f;
+
+
 out_err:
 	if (regs_in_pcc)
 		up_write(&pcc_ss_data->pcc_lock);
diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h
index 2010c0516f27..8e0b8250a139 100644
--- a/include/acpi/cppc_acpi.h
+++ b/include/acpi/cppc_acpi.h
@@ -20,14 +20,16 @@
 #include
 #include

-/* Only support CPPCv2 for now. */
-#define CPPC_NUM_ENT 21
-#define CPPC_REV 2
+/* Support CPPCv2 and CPPCv3 */
+#define CPPC_V2_REV 2
+#define CPPC_V3_REV 3
+#define CPPC_V2_NUM_ENT 21
+#define CPPC_V3_NUM_ENT 23

 #define PCC_CMD_COMPLETE_MASK (1 << 0)
 #define PCC_ERROR_MASK (1 << 2)

-#define MAX_CPC_REG_ENT 19
+#define MAX_CPC_REG_ENT 21

 /* CPPC specific PCC commands. */
 #define CMD_READ 0
@@ -91,6 +93,8 @@ enum cppc_regs {
 	AUTO_ACT_WINDOW,
 	ENERGY_PERF,
 	REFERENCE_PERF,
+	LOWEST_FREQ,
+	NOMINAL_FREQ,
 };

 /*
@@ -104,6 +108,8 @@ struct cppc_perf_caps {
 	u32 nominal_perf;
 	u32 lowest_perf;
 	u32 lowest_nonlinear_perf;
+	u32 lowest_freq;
+	u32 nominal_freq;
 };

 struct cppc_perf_ctrls {
-- cgit v1.2.3


From a9c2dfc8527318a27db045cd7ea51e8ecab8c884 Mon Sep 17 00:00:00 2001
From: Takashi Iwai
Date: Mon, 23 Apr 2018 17:24:56 +0200
Subject: ALSA: hda - Use a macro for snd_array iteration loops

Introduce a new helper macro, snd_array_for_each(), to iterate over the
elements of a snd_array. It reads slightly better than the lengthy
open-coded loops at each call site. Along with it, add const prefixes in
some obvious places.

There should be no functional change from this.
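For reference, given the definition added below, a loop written as

	snd_array_for_each(&spec->paths, i, path)

expands to the same open-coded iteration it replaces:

	for (i = 0, path = (&spec->paths)->list;
	     i < (&spec->paths)->used;
	     path = snd_array_elem(&spec->paths, ++i))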
Signed-off-by: Takashi Iwai
---
 include/sound/hdaudio.h         |  5 +++++
 sound/hda/hdac_regmap.c         |  4 ++--
 sound/pci/hda/hda_auto_parser.c | 10 +++++-----
 sound/pci/hda/hda_codec.c       | 36 ++++++++++++++++++------------------
 sound/pci/hda/hda_generic.c     | 27 +++++++++++++--------------
 sound/pci/hda/hda_sysfs.c       | 20 ++++++++++----------
 sound/pci/hda/patch_conexant.c  |  5 ++---
 sound/pci/hda/patch_realtek.c   |  4 ++--
 8 files changed, 57 insertions(+), 54 deletions(-)

(limited to 'include')

diff --git a/include/sound/hdaudio.h b/include/sound/hdaudio.h
index 06536e01ed94..c052afc27547 100644
--- a/include/sound/hdaudio.h
+++ b/include/sound/hdaudio.h
@@ -571,4 +571,9 @@ static inline unsigned int snd_array_index(struct snd_array *array, void *ptr)
 	return (unsigned long)(ptr - array->list) / array->elem_size;
 }
 
+/* a helper macro to iterate for each snd_array element */
+#define snd_array_for_each(array, idx, ptr) \
+	for ((idx) = 0, (ptr) = (array)->list; (idx) < (array)->used; \
+	     (ptr) = snd_array_elem(array, ++(idx)))
+
 #endif /* __SOUND_HDAUDIO_H */
diff --git a/sound/hda/hdac_regmap.c b/sound/hda/hdac_regmap.c
index 47a358fab132..419e285e0226 100644
--- a/sound/hda/hdac_regmap.c
+++ b/sound/hda/hdac_regmap.c
@@ -65,10 +65,10 @@ static bool hda_writeable_reg(struct device *dev, unsigned int reg)
 {
 	struct hdac_device *codec = dev_to_hdac_dev(dev);
 	unsigned int verb = get_verb(reg);
+	const unsigned int *v;
 	int i;
 
-	for (i = 0; i < codec->vendor_verbs.used; i++) {
-		unsigned int *v = snd_array_elem(&codec->vendor_verbs, i);
+	snd_array_for_each(&codec->vendor_verbs, i, v) {
 		if (verb == *v)
 			return true;
 	}
diff --git a/sound/pci/hda/hda_auto_parser.c b/sound/pci/hda/hda_auto_parser.c
index d3ea73171a3d..b9a6b66aeb0e 100644
--- a/sound/pci/hda/hda_auto_parser.c
+++ b/sound/pci/hda/hda_auto_parser.c
@@ -793,11 +793,11 @@ EXPORT_SYMBOL_GPL(snd_hda_add_verbs);
  */
 void snd_hda_apply_verbs(struct hda_codec *codec)
 {
+	const struct hda_verb **v;
 	int i;
-	for (i = 0; i < codec->verbs.used; i++) {
-		struct hda_verb **v = snd_array_elem(&codec->verbs, i);
+
+	snd_array_for_each(&codec->verbs, i, v)
 		snd_hda_sequence_write(codec, *v);
-	}
 }
 EXPORT_SYMBOL_GPL(snd_hda_apply_verbs);
 
@@ -890,10 +890,10 @@ EXPORT_SYMBOL_GPL(snd_hda_apply_fixup);
 static bool pin_config_match(struct hda_codec *codec,
 			     const struct hda_pintbl *pins)
 {
+	const struct hda_pincfg *pin;
 	int i;
 
-	for (i = 0; i < codec->init_pins.used; i++) {
-		struct hda_pincfg *pin = snd_array_elem(&codec->init_pins, i);
+	snd_array_for_each(&codec->init_pins, i, pin) {
 		hda_nid_t nid = pin->nid;
 		u32 cfg = pin->cfg;
 		const struct hda_pintbl *t_pins;
diff --git a/sound/pci/hda/hda_codec.c b/sound/pci/hda/hda_codec.c
index 5bc3a7468e17..0aa923d129f5 100644
--- a/sound/pci/hda/hda_codec.c
+++ b/sound/pci/hda/hda_codec.c
@@ -481,9 +481,10 @@ static struct hda_pincfg *look_up_pincfg(struct hda_codec *codec,
 					 struct snd_array *array,
 					 hda_nid_t nid)
 {
+	struct hda_pincfg *pin;
 	int i;
-	for (i = 0; i < array->used; i++) {
-		struct hda_pincfg *pin = snd_array_elem(array, i);
+
+	snd_array_for_each(array, i, pin) {
 		if (pin->nid == nid)
 			return pin;
 	}
@@ -618,14 +619,15 @@ EXPORT_SYMBOL_GPL(snd_hda_codec_get_pin_target);
  */
 void snd_hda_shutup_pins(struct hda_codec *codec)
 {
+	const struct hda_pincfg *pin;
 	int i;
+
 	/* don't shut up pins when unloading the driver; otherwise it breaks
 	 * the default pin setup at the next load of the driver
 	 */
 	if (codec->bus->shutdown)
 		return;
-	for (i = 0; i < codec->init_pins.used; i++) {
-		struct hda_pincfg *pin = snd_array_elem(&codec->init_pins, i);
+	snd_array_for_each(&codec->init_pins, i, pin) {
 		/* use read here for syncing after issuing each verb */
 		snd_hda_codec_read(codec, pin->nid, 0,
 				   AC_VERB_SET_PIN_WIDGET_CONTROL, 0);
@@ -638,13 +640,14 @@ EXPORT_SYMBOL_GPL(snd_hda_shutup_pins);
 /* Restore the pin controls cleared previously via snd_hda_shutup_pins() */
 static void restore_shutup_pins(struct hda_codec *codec)
 {
+	const struct hda_pincfg *pin;
 	int i;
+
 	if (!codec->pins_shutup)
 		return;
 	if (codec->bus->shutdown)
 		return;
-	for (i = 0; i < codec->init_pins.used; i++) {
-		struct hda_pincfg *pin = snd_array_elem(&codec->init_pins, i);
+	snd_array_for_each(&codec->init_pins, i, pin) {
 		snd_hda_codec_write(codec, pin->nid, 0,
 				    AC_VERB_SET_PIN_WIDGET_CONTROL,
 				    pin->ctrl);
@@ -697,8 +700,7 @@ get_hda_cvt_setup(struct hda_codec *codec, hda_nid_t nid)
 	struct hda_cvt_setup *p;
 	int i;
 
-	for (i = 0; i < codec->cvt_setups.used; i++) {
-		p = snd_array_elem(&codec->cvt_setups, i);
+	snd_array_for_each(&codec->cvt_setups, i, p) {
 		if (p->nid == nid)
 			return p;
 	}
@@ -1076,8 +1078,7 @@ void snd_hda_codec_setup_stream(struct hda_codec *codec, hda_nid_t nid,
 	/* make other inactive cvts with the same stream-tag dirty */
 	type = get_wcaps_type(get_wcaps(codec, nid));
 	list_for_each_codec(c, codec->bus) {
-		for (i = 0; i < c->cvt_setups.used; i++) {
-			p = snd_array_elem(&c->cvt_setups, i);
+		snd_array_for_each(&c->cvt_setups, i, p) {
 			if (!p->active && p->stream_tag == stream_tag &&
 			    get_wcaps_type(get_wcaps(c, p->nid)) == type)
 				p->dirty = 1;
@@ -1140,12 +1141,11 @@ static void really_cleanup_stream(struct hda_codec *codec,
 static void purify_inactive_streams(struct hda_codec *codec)
 {
 	struct hda_codec *c;
+	struct hda_cvt_setup *p;
 	int i;
 
 	list_for_each_codec(c, codec->bus) {
-		for (i = 0; i < c->cvt_setups.used; i++) {
-			struct hda_cvt_setup *p;
-			p = snd_array_elem(&c->cvt_setups, i);
+		snd_array_for_each(&c->cvt_setups, i, p) {
 			if (p->dirty)
 				really_cleanup_stream(c, p);
 		}
@@ -1156,10 +1156,10 @@ static void purify_inactive_streams(struct hda_codec *codec)
 /* clean up all streams; called from suspend */
 static void hda_cleanup_all_streams(struct hda_codec *codec)
 {
+	struct hda_cvt_setup *p;
 	int i;
 
-	for (i = 0; i < codec->cvt_setups.used; i++) {
-		struct hda_cvt_setup *p = snd_array_elem(&codec->cvt_setups, i);
+	snd_array_for_each(&codec->cvt_setups, i, p) {
 		if (p->stream_tag)
 			really_cleanup_stream(codec, p);
 	}
@@ -2461,10 +2461,10 @@ EXPORT_SYMBOL_GPL(snd_hda_create_dig_out_ctls);
 struct hda_spdif_out *snd_hda_spdif_out_of_nid(struct hda_codec *codec,
 					       hda_nid_t nid)
 {
+	struct hda_spdif_out *spdif;
 	int i;
-	for (i = 0; i < codec->spdif_out.used; i++) {
-		struct hda_spdif_out *spdif =
-			snd_array_elem(&codec->spdif_out, i);
+
+	snd_array_for_each(&codec->spdif_out, i, spdif) {
 		if (spdif->nid == nid)
 			return spdif;
 	}
diff --git a/sound/pci/hda/hda_generic.c b/sound/pci/hda/hda_generic.c
index 5cc65093d941..51030f040745 100644
--- a/sound/pci/hda/hda_generic.c
+++ b/sound/pci/hda/hda_generic.c
@@ -264,10 +264,10 @@ static struct nid_path *get_nid_path(struct hda_codec *codec,
 				     int anchor_nid)
 {
 	struct hda_gen_spec *spec = codec->spec;
+	struct nid_path *path;
 	int i;
 
-	for (i = 0; i < spec->paths.used; i++) {
-		struct nid_path *path = snd_array_elem(&spec->paths, i);
+	snd_array_for_each(&spec->paths, i, path) {
 		if (path->depth <= 0)
 			continue;
 		if ((!from_nid || path->path[0] == from_nid) &&
@@ -325,10 +325,10 @@ EXPORT_SYMBOL_GPL(snd_hda_get_path_from_idx);
 static bool is_dac_already_used(struct hda_codec *codec, hda_nid_t nid)
 {
 	struct hda_gen_spec *spec = codec->spec;
+	const struct nid_path *path;
 	int i;
 
-	for (i = 0; i < spec->paths.used; i++) {
-		struct nid_path *path = snd_array_elem(&spec->paths, i);
+	snd_array_for_each(&spec->paths, i, path) {
 		if (path->path[0] == nid)
 			return true;
 	}
@@ -351,11 +351,11 @@ static bool is_reachable_path(struct hda_codec *codec,
 static bool is_ctl_used(struct hda_codec *codec, unsigned int val, int type)
 {
 	struct hda_gen_spec *spec = codec->spec;
+	const struct nid_path *path;
 	int i;
 
 	val &= AMP_VAL_COMPARE_MASK;
-	for (i = 0; i < spec->paths.used; i++) {
-		struct nid_path *path = snd_array_elem(&spec->paths, i);
+	snd_array_for_each(&spec->paths, i, path) {
 		if ((path->ctls[type] & AMP_VAL_COMPARE_MASK) == val)
 			return true;
 	}
@@ -638,13 +638,13 @@ static bool is_active_nid(struct hda_codec *codec, hda_nid_t nid,
 {
 	struct hda_gen_spec *spec = codec->spec;
 	int type = get_wcaps_type(get_wcaps(codec, nid));
+	const struct nid_path *path;
 	int i, n;
 
 	if (nid == codec->core.afg)
 		return true;
 
-	for (n = 0; n < spec->paths.used; n++) {
-		struct nid_path *path = snd_array_elem(&spec->paths, n);
+	snd_array_for_each(&spec->paths, n, path) {
 		if (!path->active)
 			continue;
 		if (codec->power_save_node) {
@@ -2696,10 +2696,10 @@ static const struct snd_kcontrol_new out_jack_mode_enum = {
 static bool find_kctl_name(struct hda_codec *codec, const char *name, int idx)
 {
 	struct hda_gen_spec *spec = codec->spec;
+	const struct snd_kcontrol_new *kctl;
 	int i;
 
-	for (i = 0; i < spec->kctls.used; i++) {
-		struct snd_kcontrol_new *kctl = snd_array_elem(&spec->kctls, i);
+	snd_array_for_each(&spec->kctls, i, kctl) {
 		if (!strcmp(kctl->name, name) && kctl->index == idx)
 			return true;
 	}
@@ -4021,8 +4021,7 @@ static hda_nid_t set_path_power(struct hda_codec *codec, hda_nid_t nid,
 	struct nid_path *path;
 	int n;
 
-	for (n = 0; n < spec->paths.used; n++) {
-		path = snd_array_elem(&spec->paths, n);
+	snd_array_for_each(&spec->paths, n, path) {
 		if (!path->depth)
 			continue;
 		if (path->path[0] == nid ||
@@ -5831,10 +5830,10 @@ static void init_digital(struct hda_codec *codec)
  */
 static void clear_unsol_on_unused_pins(struct hda_codec *codec)
 {
+	const struct hda_pincfg *pin;
 	int i;
 
-	for (i = 0; i < codec->init_pins.used; i++) {
-		struct hda_pincfg *pin = snd_array_elem(&codec->init_pins, i);
+	snd_array_for_each(&codec->init_pins, i, pin) {
 		hda_nid_t nid = pin->nid;
 		if (is_jack_detectable(codec, nid) &&
 		    !snd_hda_jack_tbl_get(codec, nid))
diff --git a/sound/pci/hda/hda_sysfs.c b/sound/pci/hda/hda_sysfs.c
index 9b7efece4484..6ec79c58d48d 100644
--- a/sound/pci/hda/hda_sysfs.c
+++ b/sound/pci/hda/hda_sysfs.c
@@ -80,10 +80,10 @@ static ssize_t pin_configs_show(struct hda_codec *codec,
 				struct snd_array *list,
 				char *buf)
 {
+	const struct hda_pincfg *pin;
 	int i, len = 0;
 	mutex_lock(&codec->user_mutex);
-	for (i = 0; i < list->used; i++) {
-		struct hda_pincfg *pin = snd_array_elem(list, i);
+	snd_array_for_each(list, i, pin) {
 		len += sprintf(buf + len, "0x%02x 0x%08x\n",
 			       pin->nid, pin->cfg);
 	}
@@ -217,10 +217,10 @@ static ssize_t init_verbs_show(struct device *dev,
			       char *buf)
 {
 	struct hda_codec *codec = dev_get_drvdata(dev);
+	const struct hda_verb *v;
 	int i, len = 0;
 	mutex_lock(&codec->user_mutex);
-	for (i = 0; i < codec->init_verbs.used; i++) {
-		struct hda_verb *v = snd_array_elem(&codec->init_verbs, i);
+	snd_array_for_each(&codec->init_verbs, i, v) {
 		len += snprintf(buf + len, PAGE_SIZE - len,
 				"0x%02x 0x%03x 0x%04x\n",
 				v->nid, v->verb, v->param);
@@ -267,10 +267,10 @@ static ssize_t hints_show(struct device *dev,
			  char *buf)
 {
 	struct hda_codec *codec = dev_get_drvdata(dev);
+	const struct hda_hint *hint;
 	int i, len = 0;
 	mutex_lock(&codec->user_mutex);
-	for (i = 0; i < codec->hints.used; i++) {
-		struct hda_hint *hint = snd_array_elem(&codec->hints, i);
+	snd_array_for_each(&codec->hints, i, hint) {
 		len += snprintf(buf + len, PAGE_SIZE - len, "%s = %s\n",
 				hint->key, hint->val);
 	}
@@ -280,10 +280,10 @@ static ssize_t hints_show(struct device *dev,
 static struct hda_hint *get_hint(struct hda_codec *codec, const char *key)
 {
+	struct hda_hint *hint;
 	int i;
 
-	for (i = 0; i < codec->hints.used; i++) {
-		struct hda_hint *hint = snd_array_elem(&codec->hints, i);
+	snd_array_for_each(&codec->hints, i, hint) {
 		if (!strcmp(hint->key, key))
 			return hint;
 	}
@@ -783,13 +783,13 @@ void snd_hda_sysfs_init(struct hda_codec *codec)
 void snd_hda_sysfs_clear(struct hda_codec *codec)
 {
 #ifdef CONFIG_SND_HDA_RECONFIG
+	struct hda_hint *hint;
 	int i;
 
 	/* clear init verbs */
 	snd_array_free(&codec->init_verbs);
 	/* clear hints */
-	for (i = 0; i < codec->hints.used; i++) {
-		struct hda_hint *hint = snd_array_elem(&codec->hints, i);
+	snd_array_for_each(&codec->hints, i, hint) {
 		kfree(hint->key); /* we don't need to free hint->val */
 	}
 	snd_array_free(&codec->hints);
diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c
index 5b4dbcec6de8..093d2a9ece85 100644
--- a/sound/pci/hda/patch_conexant.c
+++ b/sound/pci/hda/patch_conexant.c
@@ -588,6 +588,7 @@ static void cxt_fixup_olpc_xo(struct hda_codec *codec,
				    const struct hda_fixup *fix, int action)
 {
 	struct conexant_spec *spec = codec->spec;
+	struct snd_kcontrol_new *kctl;
 	int i;
 
 	if (action != HDA_FIXUP_ACT_PROBE)
@@ -606,9 +607,7 @@ static void cxt_fixup_olpc_xo(struct hda_codec *codec,
 	snd_hda_codec_set_pin_target(codec, 0x1a, PIN_VREF50);
 
 	/* override mic boost control */
-	for (i = 0; i < spec->gen.kctls.used; i++) {
-		struct snd_kcontrol_new *kctl =
-			snd_array_elem(&spec->gen.kctls, i);
+	snd_array_for_each(&spec->gen.kctls, i, kctl) {
 		if (!strcmp(kctl->name, "Mic Boost Volume")) {
 			kctl->put = olpc_xo_mic_boost_put;
 			break;
diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index aef1f52db7d9..7f2d5b157b75 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -2828,6 +2828,7 @@ static int find_ext_mic_pin(struct hda_codec *codec);
 
 static void alc286_shutup(struct hda_codec *codec)
 {
+	const struct hda_pincfg *pin;
 	int i;
 	int mic_pin = find_ext_mic_pin(codec);
 	/* don't shut up pins when unloading the driver; otherwise it breaks
@@ -2835,8 +2836,7 @@ static void alc286_shutup(struct hda_codec *codec)
 	 */
 	if (codec->bus->shutdown)
 		return;
-	for (i = 0; i < codec->init_pins.used; i++) {
-		struct hda_pincfg *pin = snd_array_elem(&codec->init_pins, i);
+	snd_array_for_each(&codec->init_pins, i, pin) {
 		/* use read here for syncing after issuing each verb */
 		if (pin->nid != mic_pin)
 			snd_hda_codec_read(codec, pin->nid, 0,
--
cgit v1.2.3
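The conversion in this patch is mechanical: hoist the element pointer out of the loop body, then let snd_array_for_each() drive both the index and the pointer. A minimal user-space sketch of the idiom, for illustration only — struct snd_array_demo, its fields and main() are invented stand-ins; only the macro body mirrors the hdaudio.h hunk above:

#include <stdio.h>

/* invented stand-in for struct snd_array: a flat buffer of fixed-size elements */
struct snd_array_demo {
	unsigned int used;	/* number of elements currently in use */
	unsigned int elem_size;	/* size of one element in bytes */
	void *list;		/* the element buffer */
};

/* same role as snd_array_elem() in hdaudio.h: address of the idx-th element */
static void *snd_array_elem(struct snd_array_demo *array, unsigned int idx)
{
	return (char *)array->list + idx * array->elem_size;
}

/* the helper macro, as added to include/sound/hdaudio.h above */
#define snd_array_for_each(array, idx, ptr) \
	for ((idx) = 0, (ptr) = (array)->list; (idx) < (array)->used; \
	     (ptr) = snd_array_elem(array, ++(idx)))

int main(void)
{
	int vals[] = { 10, 20, 30 };
	struct snd_array_demo arr = {
		.used = 3, .elem_size = sizeof(int), .list = vals,
	};
	const int *v;
	int i;

	/* replaces the old open-coded "for (i = 0; i < arr.used; i++)" loop */
	snd_array_for_each(&arr, i, v)
		printf("elem %d = %d\n", i, *v);
	return 0;
}

Note that the macro leaves ptr pointing one past the last element when the loop exits; every converted call site above only dereferences it inside the loop body, where it is always valid.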
From ccc3b2b3482c2c05d05fd2cfbf0c28d644b4b0c2 Mon Sep 17 00:00:00 2001
From: Daniel Vetter
Date: Thu, 5 Apr 2018 17:44:42 +0200
Subject: drm: Move simple_display_pipe prepare_fb helper into gem fb helpers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There's nothing tinydrm-specific to this, and there are a few more
copies of the same in various other drivers.

Signed-off-by: Daniel Vetter
Cc: Gustavo Padovan
Cc: Maarten Lankhorst
Cc: Sean Paul
Cc: David Airlie
Cc: David Lechner
Cc: "Noralf Trønnes"
Cc: Daniel Vetter
Cc: Shawn Guo
Cc: Neil Armstrong
Cc: Daniel Stone
Cc: Haneen Mohammed
Cc: Ben Widawsky
Cc: "Ville Syrjälä"
Reviewed-by: Oleksandr Andrushchenko
Acked-by: David Lechner
Reviewed-by: Noralf Trønnes
Link: https://patchwork.freedesktop.org/patch/msgid/20180405154449.23038-3-daniel.vetter@ffwll.ch
---
 drivers/gpu/drm/drm_gem_framebuffer_helper.c | 19 +++++++++++++++++++
 drivers/gpu/drm/tinydrm/core/tinydrm-pipe.c  | 17 -----------------
 drivers/gpu/drm/tinydrm/ili9225.c            |  2 +-
 drivers/gpu/drm/tinydrm/mi0283qt.c           |  3 ++-
 drivers/gpu/drm/tinydrm/repaper.c            |  2 +-
 drivers/gpu/drm/tinydrm/st7586.c             |  2 +-
 drivers/gpu/drm/tinydrm/st7735r.c            |  2 +-
 include/drm/drm_gem_framebuffer_helper.h     |  3 +++
 include/drm/drm_simple_kms_helper.h          |  3 +++
 include/drm/tinydrm/tinydrm.h                |  2 --
 10 files changed, 31 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_gem_framebuffer_helper.c b/drivers/gpu/drm/drm_gem_framebuffer_helper.c
index 4d682a6e8bcb..acfbc0641a06 100644
--- a/drivers/gpu/drm/drm_gem_framebuffer_helper.c
+++ b/drivers/gpu/drm/drm_gem_framebuffer_helper.c
@@ -22,6 +22,7 @@
 #include
 #include
 #include
+#include
 
 /**
  * DOC: overview
@@ -265,6 +266,24 @@ int drm_gem_fb_prepare_fb(struct drm_plane *plane,
 }
 EXPORT_SYMBOL_GPL(drm_gem_fb_prepare_fb);
 
+/**
+ * drm_gem_fb_simple_display_pipe_prepare_fb - prepare_fb helper for
+ *     &drm_simple_display_pipe
+ * @pipe: Simple display pipe
+ * @plane_state: Plane state
+ *
+ * This function uses drm_gem_fb_prepare_fb() to check if the plane FB has a
+ * &dma_buf attached, extracts the exclusive fence and attaches it to plane
+ * state for the atomic helper to wait on. Drivers can use this as their
+ * &drm_simple_display_pipe_funcs.prepare_fb callback.
+ */
+int drm_gem_fb_simple_display_pipe_prepare_fb(struct drm_simple_display_pipe *pipe,
+					      struct drm_plane_state *plane_state)
+{
+	return drm_gem_fb_prepare_fb(&pipe->plane, plane_state);
+}
+EXPORT_SYMBOL(drm_gem_fb_simple_display_pipe_prepare_fb);
+
 /**
  * drm_gem_fbdev_fb_create - Create a GEM backed &drm_framebuffer for fbdev
  * emulation
diff --git a/drivers/gpu/drm/tinydrm/core/tinydrm-pipe.c b/drivers/gpu/drm/tinydrm/core/tinydrm-pipe.c
index e68b528ae64d..7e8e24d0b7a7 100644
--- a/drivers/gpu/drm/tinydrm/core/tinydrm-pipe.c
+++ b/drivers/gpu/drm/tinydrm/core/tinydrm-pipe.c
@@ -138,23 +138,6 @@ void tinydrm_display_pipe_update(struct drm_simple_display_pipe *pipe,
 }
 EXPORT_SYMBOL(tinydrm_display_pipe_update);
 
-/**
- * tinydrm_display_pipe_prepare_fb - Display pipe prepare_fb helper
- * @pipe: Simple display pipe
- * @plane_state: Plane state
- *
- * This function uses drm_gem_fb_prepare_fb() to check if the plane FB has an
- * dma-buf attached, extracts the exclusive fence and attaches it to plane
- * state for the atomic helper to wait on. Drivers can use this as their
- * &drm_simple_display_pipe_funcs->prepare_fb callback.
- */
-int tinydrm_display_pipe_prepare_fb(struct drm_simple_display_pipe *pipe,
-				    struct drm_plane_state *plane_state)
-{
-	return drm_gem_fb_prepare_fb(&pipe->plane, plane_state);
-}
-EXPORT_SYMBOL(tinydrm_display_pipe_prepare_fb);
-
 static int tinydrm_rotate_mode(struct drm_display_mode *mode,
 			       unsigned int rotation)
 {
diff --git a/drivers/gpu/drm/tinydrm/ili9225.c b/drivers/gpu/drm/tinydrm/ili9225.c
index 0874e877b111..841c69aba059 100644
--- a/drivers/gpu/drm/tinydrm/ili9225.c
+++ b/drivers/gpu/drm/tinydrm/ili9225.c
@@ -354,7 +354,7 @@ static const struct drm_simple_display_pipe_funcs ili9225_pipe_funcs = {
 	.enable = ili9225_pipe_enable,
 	.disable = ili9225_pipe_disable,
 	.update = tinydrm_display_pipe_update,
-	.prepare_fb = tinydrm_display_pipe_prepare_fb,
+	.prepare_fb = drm_gem_fb_simple_display_pipe_prepare_fb,
 };
 
 static const struct drm_display_mode ili9225_mode = {
diff --git a/drivers/gpu/drm/tinydrm/mi0283qt.c b/drivers/gpu/drm/tinydrm/mi0283qt.c
index 4e6d2ee94e55..d5ef65179c16 100644
--- a/drivers/gpu/drm/tinydrm/mi0283qt.c
+++ b/drivers/gpu/drm/tinydrm/mi0283qt.c
@@ -19,6 +19,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
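With the helper moved into the core, a simple-pipe driver wires its prepare_fb hook straight to it instead of carrying a private copy. A hedged sketch for a hypothetical driver — the foo_* names are invented for illustration; only drm_simple_display_pipe_funcs and drm_gem_fb_simple_display_pipe_prepare_fb come from this patch:

#include <drm/drm_gem_framebuffer_helper.h>
#include <drm/drm_simple_kms_helper.h>

/* invented example hook: flush the updated plane contents to the device */
static void foo_pipe_update(struct drm_simple_display_pipe *pipe,
			    struct drm_plane_state *old_state)
{
	/* driver-specific damage handling would go here */
}

static const struct drm_simple_display_pipe_funcs foo_pipe_funcs = {
	.update     = foo_pipe_update,
	/* shared helper: attaches the FB's exclusive dma-buf fence, if any,
	 * to the plane state so the atomic helpers wait on it
	 */
	.prepare_fb = drm_gem_fb_simple_display_pipe_prepare_fb,
};

Keeping the fence plumbing in drm_gem_framebuffer_helper.c means the tinydrm drivers touched above (ili9225, mi0283qt, repaper, st7586, st7735r) all share one implementation rather than tinydrm re-exporting its own wrapper.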